From d688dff83d7e6fda3d1ad6f4eb5b1474b9ddc626 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Wed, 1 Apr 2026 11:48:16 -0500 Subject: [PATCH] Added CHANGELOG page to the site and removed legacy mkdocs structure --- _quarto.yml | 2 + dev/build.sh | 63 ----- dev/cleanup.sh | 13 - development/changelog.qmd | 1 + docs/about.md | 103 -------- docs/development/contributing.md | 274 -------------------- docs/development/index.md | 7 - docs/development/new-models.md | 271 ------------------- docs/development/roadmap.md | 21 -- docs/getting-started.md | 182 ------------- docs/index.md | 53 ---- docs/javascripts/mathjax.js | 19 -- docs/python_docs/api/bart.md | 6 - docs/python_docs/api/bcf.md | 6 - docs/python_docs/api/index.md | 13 - docs/python_docs/api/low-level/dataset.md | 14 - docs/python_docs/api/low-level/forest.md | 11 - docs/python_docs/api/low-level/index.md | 11 - docs/python_docs/api/low-level/sampler.md | 16 -- docs/python_docs/api/low-level/utilities.md | 16 -- docs/python_docs/api/sklearn.md | 11 - docs/python_docs/demo/index.md | 12 - docs/python_docs/index.md | 6 - docs/vignettes/Python/.gitkeep | 0 docs/vignettes/R/.gitkeep | 0 docs/vignettes/index.md | 12 - mkdocs.yml | 140 ---------- 27 files changed, 3 insertions(+), 1280 deletions(-) delete mode 100644 dev/build.sh delete mode 100644 dev/cleanup.sh create mode 100644 development/changelog.qmd delete mode 100644 docs/about.md delete mode 100644 docs/development/contributing.md delete mode 100644 docs/development/index.md delete mode 100644 docs/development/new-models.md delete mode 100644 docs/development/roadmap.md delete mode 100644 docs/getting-started.md delete mode 100644 docs/index.md delete mode 100644 docs/javascripts/mathjax.js delete mode 100644 docs/python_docs/api/bart.md delete mode 100644 docs/python_docs/api/bcf.md delete mode 100644 docs/python_docs/api/index.md delete mode 100644 docs/python_docs/api/low-level/dataset.md delete mode 100644 docs/python_docs/api/low-level/forest.md 
delete mode 100644 docs/python_docs/api/low-level/index.md delete mode 100644 docs/python_docs/api/low-level/sampler.md delete mode 100644 docs/python_docs/api/low-level/utilities.md delete mode 100644 docs/python_docs/api/sklearn.md delete mode 100644 docs/python_docs/demo/index.md delete mode 100644 docs/python_docs/index.md delete mode 100644 docs/vignettes/Python/.gitkeep delete mode 100644 docs/vignettes/R/.gitkeep delete mode 100644 docs/vignettes/index.md delete mode 100644 mkdocs.yml diff --git a/_quarto.yml b/_quarto.yml index 3d3e98e4a..fee08d58a 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -40,6 +40,8 @@ website: href: development/contributing.qmd - text: Adding New Models href: development/new-models.qmd + - text: Changelog + href: development/changelog.qmd - text: Roadmap href: development/roadmap.qmd diff --git a/dev/build.sh b/dev/build.sh deleted file mode 100644 index d0a732b1c..000000000 --- a/dev/build.sh +++ /dev/null @@ -1,63 +0,0 @@ -#!/bin/bash - -# Clone stochtree repo -git clone --recursive git@github.com:StochasticTree/stochtree.git stochtree_repo - -# Set up python virtual environment and dependencies -python -m venv venv -source venv/bin/activate -pip install --upgrade pip -pip install -r requirements.txt - -# Install python package -cd stochtree_repo -pip install . -cd .. - -# Build the C++ doxygen output -sed -i '' 's|^OUTPUT_DIRECTORY *=.*|OUTPUT_DIRECTORY = ../docs/cpp_docs/|' stochtree_repo/Doxyfile -sed -i '' 's|^GENERATE_XML *=.*|GENERATE_XML = NO|' stochtree_repo/Doxyfile -sed -i '' 's|^GENERATE_HTML *=.*|GENERATE_HTML = YES|' stochtree_repo/Doxyfile -mkdir -p docs/cpp_docs/ -cd stochtree_repo -doxygen Doxyfile -cd .. 
- -# Install R package dependencies -Rscript -e 'install.packages(c("remotes", "devtools", "roxygen2", "ggplot2", "latex2exp", "decor", "pkgdown", "cpp11", "BH", "doParallel", "foreach", "knitr", "Matrix", "MASS", "mvtnorm", "rmarkdown", "testthat", "tgp"), repos="https://cloud.r-project.org/")' - -# Build the R package doc site -cd stochtree_repo -Rscript cran-bootstrap.R 1 1 1 -cd .. -mkdir -p docs/R_docs/pkgdown -Rscript -e 'pkgdown::build_site_github_pages("stochtree_repo/stochtree_cran", dest_dir = "../../docs/R_docs/pkgdown", install = TRUE)' -rm -rf stochtree_repo/stochtree_cran - -# Copy Jupyter notebook demos over to docs directory -cp stochtree_repo/demo/notebooks/supervised_learning.ipynb docs/python_docs/demo/supervised_learning.ipynb -cp stochtree_repo/demo/notebooks/causal_inference.ipynb docs/python_docs/demo/causal_inference.ipynb -cp stochtree_repo/demo/notebooks/heteroskedastic_supervised_learning.ipynb docs/python_docs/demo/heteroskedastic_supervised_learning.ipynb -cp stochtree_repo/demo/notebooks/multivariate_treatment_causal_inference.ipynb docs/python_docs/demo/multivariate_treatment_causal_inference.ipynb -cp stochtree_repo/demo/notebooks/reparameterized_causal_inference.ipynb docs/python_docs/demo/reparameterized_causal_inference.ipynb -cp stochtree_repo/demo/notebooks/serialization.ipynb docs/python_docs/demo/serialization.ipynb -cp stochtree_repo/demo/notebooks/tree_inspection.ipynb docs/python_docs/demo/tree_inspection.ipynb -cp stochtree_repo/demo/notebooks/summary.ipynb docs/python_docs/demo/summary.ipynb -cp stochtree_repo/demo/notebooks/ordinal_outcome.ipynb docs/python_docs/demo/ordinal_outcome.ipynb -cp stochtree_repo/demo/notebooks/prototype_interface.ipynb docs/python_docs/demo/prototype_interface.ipynb -cp stochtree_repo/demo/notebooks/sklearn_wrappers.ipynb docs/python_docs/demo/sklearn_wrappers.ipynb -cp stochtree_repo/demo/notebooks/multi_chain.ipynb docs/python_docs/demo/multi_chain.ipynb - -# Copy static vignettes over to 
docs directory -cp vignettes/Python/RDD/rdd.html docs/vignettes/Python/rdd.html -cp vignettes/Python/RDD/RDD_DAG.png docs/vignettes/Python/RDD_DAG.png -cp vignettes/Python/RDD/trees1.png docs/vignettes/Python/trees1.png -cp vignettes/Python/RDD/trees2.png docs/vignettes/Python/trees2.png -cp vignettes/Python/RDD/trees3.png docs/vignettes/Python/trees3.png -cp vignettes/R/RDD/rdd.html docs/vignettes/R/rdd.html -cp vignettes/Python/IV/iv.html docs/vignettes/Python/iv.html -cp vignettes/Python/IV/IV_CDAG.png docs/vignettes/Python/IV_CDAG.png -cp vignettes/R/IV/iv.html docs/vignettes/R/iv.html - -# Build the doc site -mkdocs build diff --git a/dev/cleanup.sh b/dev/cleanup.sh deleted file mode 100644 index 41cc52e2c..000000000 --- a/dev/cleanup.sh +++ /dev/null @@ -1,13 +0,0 @@ -# Remove the stochtree_repo subfolder -rm -rf stochtree_repo - -# Remove venv -rm -rf venv - -# Remove jupyter notebooks from the docs/ directory -rm -f docs/python_docs/demo/*.ipynb - -# Remove Python / R vignettes -rm -f docs/vignettes/Python/*.html -rm -f docs/vignettes/Python/*.png -rm -f docs/vignettes/R/*.html diff --git a/development/changelog.qmd b/development/changelog.qmd new file mode 100644 index 000000000..ae0749d04 --- /dev/null +++ b/development/changelog.qmd @@ -0,0 +1 @@ +{{< include ../stochtree_repo/CHANGELOG.md >}} diff --git a/docs/about.md b/docs/about.md deleted file mode 100644 index 8ad67a513..000000000 --- a/docs/about.md +++ /dev/null @@ -1,103 +0,0 @@ -# Overview of Stochastic Tree Models - -Stochastic tree models are a powerful addition to your modeling toolkit. -As with many machine learning methods, understanding these models in depth is an involved task. 
- -There are many excellent published papers on stochastic tree models -(to name a few, the [original BART paper](https://projecteuclid.org/journals/annals-of-applied-statistics/volume-4/issue-1/BART-Bayesian-additive-regression-trees/10.1214/09-AOAS285.full), -[the XBART paper](https://www.tandfonline.com/doi/full/10.1080/01621459.2021.1942012), -and [the BCF paper](https://projecteuclid.org/journals/bayesian-analysis/volume-15/issue-3/Bayesian-Regression-Tree-Models-for-Causal-Inference--Regularization-Confounding/10.1214/19-BA1195.full)). -Here, we aim to build up an abbreviated intuition for these models from their conceptually-simple building blocks. - -## Notation - -We're going to introduce some notation to make these concepts precise. -In a traditional supervised learning setting, we hope to predict some **outcome** from **features** in a training dataset. -We'll call the outcome $y$ and the features $X$. -Our goal is to come up with a function $f$ that predicts the outcome $y$ as well as possible from $X$ alone. - -## Decision Trees - -[Decision tree learning](https://en.wikipedia.org/wiki/Decision_tree_learning) is a simple machine learning method that -constructs a function $f$ from a series of conditional statements. Consider the tree below. - -```mermaid -stateDiagram-v2 - state split_one <> - state split_two <> - split_one --> split_two: if x1 <= 1 - split_one --> c : if x1 > 1 - split_two --> a: if x2 <= -2 - split_two --> b : if x2 > -2 -``` - -We evaluate two conditional statments (`X[,1] > 1` and `X[,2] > -2`), arranged in a tree-like sequence of branches, -which determine whether the model predicts `a`, `b`, or `c`. 
We could similarly express this tree in math notation as - -\begin{equation*} -f(X_i) = \begin{cases} -a & ; \;\;\; X_{i,1} \leq 1, \;\; X_{i,2} \leq -2\\ -b & ; \;\;\; X_{i,1} \leq 1, \;\; X_{i,2} > -2\\ -c & ; \;\;\; X_{i,1} > 1 -\end{cases} -\end{equation*} - -We won't belabor the discussion of trees as there are many good textbooks and online articles on the topic, -but we'll close by noting that training decision trees introduces a delicate balance between -[overfitting and underfitting](https://en.wikipedia.org/wiki/Overfitting). -Simple trees like the one above do not capture much complexity in a dataset and may potentially be underfit -while deep, complex trees are vulnerable to overfitting and tend to have high variance. - -## Boosted Decision Tree Ensembles - -One way to address the overfitting-underfitting tradeoff of decision trees is to build an "ensemble" of decision -trees, so that the function $f$ is defined by a sum of $k$ individual decision trees $g_i$ - -\begin{equation*} -f(X_i) = g_1(X_i) + \dots + g_k(X_i) -\end{equation*} - -There are several ways to train an ensemble of decision trees (sometimes called "forests"), the most popular of which are [random forests](https://en.wikipedia.org/wiki/Random_forest) and -[gradient boosting](https://en.wikipedia.org/wiki/Gradient_boosting). Their main difference is that random forests train -all $m$ trees independently of one another, while boosting trains trees sequentially, so that tree $j$ depends on the result of training trees 1 through $j-1$. -Libraries like [xgboost](https://xgboost.readthedocs.io/en/stable/) and [LightGBM](https://lightgbm.readthedocs.io/en/latest/) are popular examples of boosted tree ensembles. - -Tree ensembles often [outperform neural networks and other machine learning methods on tabular datasets](https://arxiv.org/abs/2207.08815), -but classic tree ensemble methods return a single estimated function $f$, without expressing uncertainty around its estimates. 
- -## Stochastic Tree Ensembles - -[Stochastic](https://en.wikipedia.org/wiki/Stochastic) tree ensembles differ from their classical counterparts in their use of randomness in learning a function. -Rather than returning a single "best" tree ensemble, stochastic tree ensembles return a range of tree ensembles that fit the data well. -Mechanically, it's useful to think of "sampling" -- rather than "fitting" -- a stochastic tree ensemble model. - -Why is this useful? Suppose we've sampled $m$ forests. For each observation $i$, we obtain $m$ predictions: $[f_1(X_i), \dots, f_m(X_i)]$. -From this "dataset" of predictions, we can compute summary statistics, where a mean or median would give something akin to the predictions of an xgboost or lightgbm model, -and the $\alpha$ and $1-\alpha$ quantiles give a [credible interval](https://en.wikipedia.org/wiki/Credible_interval). - -Rather than explain each of the models that `stochtree` supports in depth here, we provide a high-level overview, with pointers to the relevant literature. - -### Supervised Learning - -The [`bart`](R_docs/pkgdown/reference/bart.html) R function and the [`BARTModel`](python_docs/api/bart.md) python class are the primary interface for supervised -prediction tasks in `stochtree`. The primary references for these models are -[BART (Chipman, George, McCulloch 2010)](https://projecteuclid.org/journals/annals-of-applied-statistics/volume-4/issue-1/BART-Bayesian-additive-regression-trees/10.1214/09-AOAS285.full) and -[XBART (He and Hahn 2021)](https://www.tandfonline.com/doi/full/10.1080/01621459.2021.1942012). - -In addition to the standard BART / XBART models, in which each tree's leaves return a constant prediction, `stochtree` also supports -arbitrary leaf regression on a user-provided basis (i.e. an expanded version of [Chipman et al 2002](https://link.springer.com/article/10.1023/A:1013916107446) and [Gramacy and Lee 2012](https://www.tandfonline.com/doi/abs/10.1198/016214508000000689)). 
- -### Causal Inference - -The [`bcf`](R_docs/pkgdown/reference/bcf.html) R function and the [`BCFModel`](python_docs/api/bcf.md) python class are the primary interface for causal effect -estimation in `stochtree`. The primary references for these models are -[BCF (Hahn, Murray, Carvalho 2021)](https://projecteuclid.org/journals/bayesian-analysis/volume-15/issue-3/Bayesian-Regression-Tree-Models-for-Causal-Inference--Regularization-Confounding/10.1214/19-BA1195.full) and -[XBCF (Krantsevich, He, Hahn 2022)](https://arxiv.org/abs/2209.06998). - -### Additional Modeling Features - -Both the BART and BCF interfaces in `stochtree` support the following extensions: - -* Accelerated / "warm-start" sampling of forests (i.e. [He and Hahn 2021](https://www.tandfonline.com/doi/full/10.1080/01621459.2021.1942012)) -* Forest-based heteroskedasticity (i.e. [Murray 2021](https://www.tandfonline.com/doi/abs/10.1080/01621459.2020.1813587)) -* Additive random effects (i.e. [Gelman et al 2008](https://www.tandfonline.com/doi/abs/10.1198/106186008X287337)) diff --git a/docs/development/contributing.md b/docs/development/contributing.md deleted file mode 100644 index 3e519bc87..000000000 --- a/docs/development/contributing.md +++ /dev/null @@ -1,274 +0,0 @@ -# Contributing - -`stochtree` is hosted on [Github](https://github.com/StochasticTree/stochtree/). -Any feedback, requests, or bug reports can be submitted as [issues](https://github.com/StochasticTree/stochtree/issues). -Moreover, if you have ideas for how to improve stochtree, we welcome [pull requests](https://github.com/StochasticTree/stochtree/pulls). - -## Building StochTree - -Any local stochtree development will require cloning the repository from Github. -If you don't have git installed, you can do so following [these instructions](https://learn.microsoft.com/en-us/devops/develop/git/install-and-set-up-git). 
- -Once git is available at the command line, navigate to the folder that will store this project (in bash / zsh, this is done by running `cd` followed by the path to the directory). -Then, clone the `stochtree` repo as a subfolder by running -```{bash} -git clone --recursive https://github.com/StochasticTree/stochtree.git -``` - -*NOTE*: this project incorporates several C++ dependencies as [git submodules](https://git-scm.com/book/en/v2/Git-Tools-Submodules), -which is why the `--recursive` flag is necessary. If you have already cloned the repo without the `--recursive` flag, -you can retrieve the submodules recursively by running `git submodule update --init --recursive` in the main repo directory. - -### R - -This section will detail how to use RStudio to build and make changes to stochtree. There are other tools that are useful for R -package development (for example, [Positron](https://github.com/posit-dev/positron), [VS Code](https://code.visualstudio.com/docs/languages/r), -and [ESS](https://ess.r-project.org/)), but we will focus on RStudio in this walkthrough. - -Once you've cloned the stochtree repository, follow these steps to build stochtree: - -1. [Create an RStudio project in the stochtree directory](https://support.posit.co/hc/en-us/articles/200526207-Using-RStudio-Projects) -2. [Build the package in RStudio](https://docs.posit.co/ide/user/ide/guide/pkg-devel/writing-packages.html#building-a-package) - -Note that due to the complicated folder structure of the stochtree repo, step 2 might not work out of the box on all platforms. -If stochtree fails to build, you can use the script that we use to create a CRAN-friendly stochtree R package directory, which -creates a `stochtree_cran` subdirectory of the stochtree folder and copies the relevant R package files into this subfolder. -You can run this script by entering `Rscript cran-bootstrap.R 1 1 1` in the terminal in RStudio. 
-Once you have a `stochtree_cran` subfolder, you can build stochtree using - -```{r} -devtools::install_local("stochtree_cran") -``` - -Since this is a temporary folder, you can clean it up by running `Rscript cran-cleanup.R` in the terminal in RStudio. - -### Python - -Building and making changes to the python library is best done in an isolated virtual environment. There are many different ways of -managing virtual environments in Python, but here we focus on `conda` and `venv`. - -#### Conda - -Conda provides a straightforward experience in managing python dependencies, avoiding version conflicts / ABI issues / etc. - -To build stochtree using a `conda` based workflow, first create and activate a conda environment with the requisite dependencies - -```{bash} -conda create -n stochtree-dev -c conda-forge python=3.10 numpy scipy pytest pandas pybind11 scikit-learn matplotlib seaborn -conda activate stochtree-dev -pip install jupyterlab -``` - -Then install the package by navigating to the stochtree directory and running - -```{bash} -pip install . -``` - -Note that if you are making changes and finding that they aren't reflected after a reinstall of stochtree, you can -clear all of the python package build artifacts with - -```{bash} -rm -rf stochtree.egg-info; rm -rf .pytest_cache; rm -rf build -``` - -and then rerun `pip install .` - -#### Venv - -You could also use venv for environment management. First, navigate to the folder in which you usually store virtual environments -(i.e. `cd /path/to/envs`) and create and activate a virtual environment: - -```{bash} -python -m venv venv -source venv/bin/activate -``` - -Install all of the package (and demo notebook) dependencies - -```{bash} -pip install numpy scipy pytest pandas scikit-learn pybind11 matplotlib seaborn jupyterlab -``` - -Then install the package by navigating to the stochtree directory and running - -```{bash} -pip install . 
-``` - -Note that if you are making changes and finding that they aren't reflected after a reinstall of stochtree, you can -clear all of the python package development artifacts with - -```{bash} -rm -rf stochtree.egg-info; rm -rf .pytest_cache; rm -rf build -``` - -and then rerun `pip install .` - -### C++ - -#### CMake - -The C++ project can be built independently from the R / Python packages using `cmake`. -See [here](https://cmake.org/install/) for details on installing cmake (alternatively, -on MacOS, `cmake` can be installed using [homebrew](https://formulae.brew.sh/formula/cmake)). -Once `cmake` is installed, you can build the CLI by navigating to the main -project directory at your command line (i.e. `cd /path/to/stochtree`) and -running the following code - -```{bash} -rm -rf build -mkdir build -cmake -S . -B build -cmake --build build -``` - -The CMake build has two primary targets, which are detailed below - -##### Debug Program - -`debug/api_debug.cpp` defines a standalone target that can be straightforwardly run with a debugger (i.e. `lldb`, `gdb`) -while making non-trivial changes to the C++ code. -This debugging program is compiled as part of the CMake build if the `BUILD_DEBUG_TARGETS` option in `CMakeLists.txt` is set to `ON`. - -Once the program has been built, it can be run from the command line via `./build/debugstochtree` or attached to a debugger -via `lldb ./build/debugstochtree` (clang) or `gdb ./build/debugstochtree` (gcc). - -##### Unit Tests - -We test `stochtree` using the [GoogleTest](https://google.github.io/googletest/) framework. -Unit tests are compiled into a single target as part of the CMake build if the `BUILD_TEST` option is set to `ON` -and the test suite can be run after compilation via `./build/teststochtree` - -## Debugging - -Debugging stochtree invariably leads to the "core" C++ codebase, which requires care to debug correctly. 
-Below we detail how to debug stochtree's C++ core through each of the three interfaces (C++, R and Python). - -### C++ Program - -The `debugstochtree` cmake target exists precisely to quickly debug the C++ core of stochtree. - -First, you must build the program using debug symbols, which you can do by enabling the `USE_DEBUG` optoon -and ensuring that `BUILD_DEBUG_TARGETS` is also switched on, as below - -```{bash} -rm -rf build -mkdir build -cmake -S . -B build -DBUILD_DEBUG_TARGETS=ON -DUSE_DEBUG=ON -cmake --build build -``` - -From here, you can debug at the command line using [lldb](https://lldb.llvm.org/) on MacOS on [gdb](https://sourceware.org/gdb/) on linux by running -either `lldb ./build/debugstochtree` or `gdb ./build/debugstochtree` and using the appropriate lldb / gdb shortcuts to debug your program. - -#### xcode - -While using `gdb` or `lldb` on `debugstochtree` at the command line is very helpful, users may prefer debugging in a full-fledged IDE like xcode (if working in MacOS). -This project's C++ core can be converted to an xcode project from `CMakeLists.txt`, but first you must turn off sanitizers -(xcode seems to have its own way of setting this at build time for different configurations, and having injected -`-fsanitize=address` statically into compiler arguments will cause xcode errors). To do this, modify the `USE_SANITIZER` line in `CMakeLists.txt`: - -``` -option(USE_SANITIZER "Use santizer flags" OFF) -``` - -To generate an XCode project based on the build targets and specifications defined in a `CMakeLists.txt`, navigate to the main project folder (i.e. `cd /path/to/project`) and run the following commands: - -```{bash} -rm -rf xcode/ -mkdir xcode -cd xcode -cmake -G Xcode .. -DCMAKE_C_COMPILER=cc -DCMAKE_CXX_COMPILER=c++ -DUSE_SANITIZER=OFF -DUSE_DEBUG=OFF -cd .. -``` - -Now, if you navigate to the xcode subfolder (in Finder), you should be able to click on a `.xcodeproj` file and the project will open in XCode. 
- -### R Package - -Debugging stochtree R code requires building the R package with debug symbols. -The simplest way to do this is to open your R installation's `Makevars` file -by running `usethis::edit_r_makevars()` in RStudio which will open `Makevars` -in a code editor. - -If your `Makevars` file already has a line that begins with `CXX17FLAGS = ...`, -look for a `-g -O2` compiler flag and change this to `-g -O0`. If this flag isn't -set in the `CXX17FLAGS = ` line, then simply add `-g -O0` to this line after the ` = `. -If your `Makevars` file already does not have a line that begins with `CXX17FLAGS = ...`, -add `CXX17FLAGS = -g -O0`. - -Now, rebuild the R package as above. Save the R code you'd like to debug to an R script. -Suppose for the sake of illustration that the code you want to debug is saved in -`path/to/debug_script.R`. - -At the command line (either the terminal in RStudio or your local terminal program), -run `R -d lldb` if you are using MacOS (or `R -d gdb` if you are using Linux). - -Now, you'll see an lldb prompt which should look like below with a blinking cursor after it - -``` -(lldb) -``` - -From there, you can set breakpoints, either to specific lines of specific files like `b src/tree.cpp:2117` or to break whenever there is an error using `breakpoint set -E c++`. -(**Note**: in gdb, the breakpoint and control flow commands are slightly different, see [here](https://www.maths.ed.ac.uk/~swood34/RCdebug/RCdebug.html) for more detail on debugging R through `gdb`.) -Now, you can run R through the debugger by typing - -``` -r -``` - -This should load an R console, from which you can execute a script you've set up to run your code using - -```{r} -source("path/to/debug_script.R") -``` - -The code will either stop when it hits your first line-based breakpoint or when it runs into an error if you set the error-based breakpoint. -From there, you can navigate using `lldb` (or `gdb`) commands. 
- -**Note**: once you've loaded the R console, you can also simply interactively run commands that call stochtree's C++ code (i.e. running the `bart()` or `bcf()` functions). If you're debugging at this level, you probably have a specific problem in mind, and using a repeatable script will be worth your while, but it is not strictly necessary. - -### Python Package - -First, you need to build stochtree's C++ extension with debug symbols. -As always, start by navigating to the stochtree directory (i.e. `cd /path/to/stochtree/`) -and activating your development virtual environment (i.e. `conda activate [env_name]` or `source venv/bin/activate`). - -Since stochtree builds its C++ extension via cmake [following this example](https://github.com/pybind/cmake_example), -you'll need to ensure that the `self.debug` field in the `CMakeBuild` class is set to `True`. -You can do this by setting an environment variable of `DEBUG` equal to 1. -In bash, you can do this with `export DEBUG=1` at the command line. - -Once this is done, build the python library using - -```{bash} -pip install . -``` - -Suppose you'd like to debug stochtree through a script called `/path/to/script.py`. - -First, target a python process with `lldb` (or, alternatively, replace with `gdb` below if you use `gcc` as your compiler) via - -``` -lldb python -``` - -Now, you'll see an lldb (or gdb) prompt which should look like below with a blinking cursor after it - -``` -(lldb) -``` - -From there, you can set breakpoints, either to specific lines of specific files like `b src/tree.cpp:2117` or to break whenever there is an error using `breakpoint set -E c++`. -(If you're using `gdb`, see [here](https://lldb.llvm.org/use/map.html) for a comparison between lldb commands and gdb commands for setting breakpoints and navigating your program.) 
-Now you can run your python script through the debugger by typing - -``` -r /path/to/script.py -``` - -The program will run until the first breakpoint is hit, and at this point you can navigate using lldb (or gdb) commands. - -**Note**: rather than running a script like `/path/to/script.py` above, you can also simply load the python console by typing `r` at the `(lldb)` terminal and then interactively run commands that call stochtree's C++ code (i.e. sampling `BARTModel` or `BCFModel` objects). If you're debugging at this level, you probably have a specific problem in mind, and using a repeatable script will be worth your while, but it is not strictly necessary. diff --git a/docs/development/index.md b/docs/development/index.md deleted file mode 100644 index 6735d58c9..000000000 --- a/docs/development/index.md +++ /dev/null @@ -1,7 +0,0 @@ -# Development - -`stochtree` is in active development. Here, we detail some aspects of the development process - -* [Contributing](contributing.md): how to get involved with stochtree, by contributing code, documentation, or helpful feedback -* [Adding New Models](new-models.md): how to add a new outcome model in C++ and make it available through the R and Python frontends -* [Roadmap](roadmap.md): timelines for new feature development and releases diff --git a/docs/development/new-models.md b/docs/development/new-models.md deleted file mode 100644 index 15cd281d9..000000000 --- a/docs/development/new-models.md +++ /dev/null @@ -1,271 +0,0 @@ -# Adding New Models to stochtree - -While the process of working with `stochtree`'s codebase to add -functionality or fix bugs is covered in the [contributing](contributing.md) -page, this page discusses a specific type of contribution in detail: -contributing new models (i.e. likelihoods and leaf parameter priors). - -Our C++ core is designed to support any conditionally-conjugate model, but this flexibility requires some explanation in order to be easily modified. 
- -## Overview - -The key components of `stochtree`'s models are: - -1. A **SuffStat** class that stores and accumulates sufficient statistics -2. A **LeafModel** class that computes marginal likelihoods / posterior parameters and samples leaf node parameters - -Each model implements a different version of these two classes. For example, the "classic" -BART model with constant Gaussian leaves and a Gaussian likelihood is represented by the -`GaussianConstantSuffStat` and `GaussianConstantLeafModel` classes. - -Each class implements a common API, and we use a [factory pattern](https://en.wikipedia.org/wiki/Factory_(object-oriented_programming)) and the C++17 -[std::variant](https://www.cppreference.com/w/cpp/utility/variant.html) -feature to dispatch the correct model at runtime. -Finally, R and Python wrappers expose this flexibility through the BART / BCF interfaces. - -Adding a new leaf model thus requires implementing new `SuffStat` and `LeafModel` -classes, then updating the factory functions and R / Python logic. 
- -## SuffStat Class - -As a pattern, sufficient statistic classes end in `*SuffStat` and implement several methods: - -* `IncrementSuffStat`: Increment a model's sufficient statistics by one data observation -* `ResetSuffStat`: Reset a model's sufficient statistics to zero / empty -* `AddSuffStat`: Combine two sufficient statistics, storing their sum in the sufficient statistic object that calls this method (without modifying the supplied `SuffStat` objects) -* `SubtractSuffStat`: Same as above but subtracting the second `SuffStat` argument from the first, rather than adding -* `SampleGreaterThan`: Checks whether the current sample size of a `SuffStat` object is greater than some threshold -* `SampleGreaterThanEqual`: Checks whether the current sample size of a `SuffStat` object is greater than or equal to some threshold -* `SampleSize`: Returns the current sample size of a `SuffStat` object - -For the sake of illustration, imagine we are adding a model called `OurNewModel`. The new sufficient statistic class should look something like: - -```cpp -class OurNewModelSuffStat { - public: - data_size_t n; - // Custom sufficient statistics for `OurNewModel` - double stat1; - double stat2; - - OurNewModelSuffStat() { - n = 0; - stat1 = 0.0; - stat2 = 0.0; - } - - void IncrementSuffStat(ForestDataset& dataset, Eigen::VectorXd& outcome, - ForestTracker& tracker, data_size_t row_idx, int tree_idx) { - n += 1; - stat1 += /* accumulate from outcome, dataset, or tracker as needed */; - stat2 += /* accumulate from outcome, dataset, or tracker as needed */; - } - - void ResetSuffStat() { - n = 0; - stat1 = 0.0; - stat2 = 0.0; - } - - void AddSuffStat(OurNewModelSuffStat& lhs, OurNewModelSuffStat& rhs) { - n = lhs.n + rhs.n; - stat1 = lhs.stat1 + rhs.stat1; - stat2 = lhs.stat2 + rhs.stat2; - } - - void SubtractSuffStat(OurNewModelSuffStat& lhs, OurNewModelSuffStat& rhs) { - n = lhs.n - rhs.n; - stat1 = lhs.stat1 - rhs.stat1; - stat2 = lhs.stat2 - rhs.stat2; - } - - bool 
SampleGreaterThan(data_size_t threshold) { return n > threshold; } - bool SampleGreaterThanEqual(data_size_t threshold) { return n >= threshold; } - data_size_t SampleSize() { return n; } -}; -``` - -## LeafModel Class - -Leaf model classes end in `*LeafModel` and implement several methods: - -* `SplitLogMarginalLikelihood`: the log marginal likelihood of a potential split, as a function of the sufficient statistics for the newly proposed left and right node (i.e. ignoring data points unaffected by a split) -* `NoSplitLogMarginalLikelihood`: the log marginal likelihood of a node without splitting, as a function of the sufficient statistics for that node -* `SampleLeafParameters`: Sample the leaf node parameters for every leaf in a provided tree, according to this model's conditionally conjugate leaf node posterior -* `RequiresBasis`: Whether or not a model requires regressing on "basis functions" in the leaves - -As above, imagine that we are implementing a new model called `OurNewModel`. 
The new leaf model class should look something like: - -```cpp -class OurNewModelLeafModel { - public: - OurNewModelLeafModel(/* model parameters */) { - // Set model parameters - } - - double SplitLogMarginalLikelihood(OurNewModelSuffStat& left_stat, - OurNewModelSuffStat& right_stat, - double global_variance) { - double left_log_ml = /* calculate left node log ML */; - double right_log_ml = /* calculate right node log ML */; - return left_log_ml + right_log_ml; - } - - double NoSplitLogMarginalLikelihood(OurNewModelSuffStat& suff_stat, - double global_variance) { - double log_ml = /* calculate node log ML */; - return log_ml; - } - - void SampleLeafParameters(ForestDataset& dataset, ForestTracker& tracker, - ColumnVector& residual, Tree* tree, int tree_num, - double global_variance, std::mt19937& gen) { - // Sample parameters for every leaf in a tree, update `tree` directly - } - - inline bool RequiresBasis() { return /* true/false based on your model */; } - - // Helper methods below for `SampleLeafParameters`, which depend on the - // nature of the leaf model (i.e. location-scale, shape-scale, etc...) - - double PosteriorParameterMean(OurNewModelSuffStat& suff_stat, - double global_variance) { - return /* calculate posterior mean */; - } - - double PosteriorParameterVariance(OurNewModelSuffStat& suff_stat, - double global_variance) { - return /* calculate posterior variance */; - } - - private: - // Leaf model parameters - double param1_; - double param2_; -}; -``` - -## Factory Functions - -Updating the factory pattern to be able to dispatch `OurNewModel` has several steps. 
- -First, we add our model to the `ModelType` enum in `include/stochtree/leaf_model.h`: - -```cpp -enum ModelType { - kConstantLeafGaussian, - kUnivariateRegressionLeafGaussian, - kMultivariateRegressionLeafGaussian, - kLogLinearVariance, - kOurNewModel // New model -}; -``` - -Next, we add the `OurNewModelSuffStat` and `OurNewModelLeafModel` classes to the `std::variant` unions in `include/stochtree/leaf_model.h`: - -```cpp -using SuffStatVariant = std::variant; // New model - -using LeafModelVariant = std::variant; // New model -``` - -Finally, we update the factory functions to dispatch the correct class from the union based on the `ModelType` integer code - -```cpp -static inline SuffStatVariant suffStatFactory(ModelType model_type, int basis_dim = 0) { - if (model_type == kConstantLeafGaussian) { - return createSuffStat(); - } else if (model_type == kUnivariateRegressionLeafGaussian) { - return createSuffStat(); - } else if (model_type == kMultivariateRegressionLeafGaussian) { - return createSuffStat(basis_dim); - } else if (model_type == kLogLinearVariance) { - return createSuffStat(); - } else if (model_type == kOurNewModel) { // New model - return createSuffStat(); - } else { - Log::Fatal("Incompatible model type provided to suff stat factory"); - } -} - -static inline LeafModelVariant leafModelFactory(ModelType model_type, double tau, - Eigen::MatrixXd& Sigma0, double a, double b) { - if (model_type == kConstantLeafGaussian) { - return createLeafModel(tau); - } else if (model_type == kUnivariateRegressionLeafGaussian) { - return createLeafModel(tau); - } else if (model_type == kMultivariateRegressionLeafGaussian) { - return createLeafModel(Sigma0); - } else if (model_type == kLogLinearVariance) { - return createLeafModel(a, b); - } else if (model_type == kOurNewModel) { // New model - return createLeafModel(/* initializer values */); - } else { - Log::Fatal("Incompatible model type provided to leaf model factory"); - } -} -``` - -## R Wrapper - -To reflect 
this change through to the R interface, we first add the new model to the logic in the `sample_gfr_one_iteration_cpp` -and `sample_mcmc_one_iteration_cpp` functions in the `src/sampler.cpp` file - -```cpp -// Convert leaf model type to enum -StochTree::ModelType model_type; -if (leaf_model_int == 0) model_type = StochTree::ModelType::kConstantLeafGaussian; -else if (leaf_model_int == 1) model_type = StochTree::ModelType::kUnivariateRegressionLeafGaussian; -else if (leaf_model_int == 2) model_type = StochTree::ModelType::kMultivariateRegressionLeafGaussian; -else if (leaf_model_int == 3) model_type = StochTree::ModelType::kLogLinearVariance; -else if (leaf_model_int == 4) model_type = StochTree::ModelType::kOurNewModel; // New model -else StochTree::Log::Fatal("Invalid model type"); -``` - -Then we add the integer code for `OurNewModel` to the `leaf_model_type` field signature in `R/config.R` - -```r -#' @field leaf_model_type Integer specifying the leaf model type (0 = constant leaf, 1 = univariate leaf regression, 2 = multivariate leaf regression, 3 = log linear variance, 4 = your new model) -leaf_model_type = NULL, -``` - -## Python Wrapper - -Python's C++ wrapper code contains similar logic to that of the `src/sampler.cpp` file in the R interface. -Add the new model to the `SampleOneIteration` method of the `ForestSamplerCpp` class in the `src/py_stochtree.cpp` file. 
- -```cpp -// Convert leaf model type to enum -StochTree::ModelType model_type; -if (leaf_model_int == 0) model_type = StochTree::ModelType::kConstantLeafGaussian; -else if (leaf_model_int == 1) model_type = StochTree::ModelType::kUnivariateRegressionLeafGaussian; -else if (leaf_model_int == 2) model_type = StochTree::ModelType::kMultivariateRegressionLeafGaussian; -else if (leaf_model_int == 3) model_type = StochTree::ModelType::kLogLinearVariance; -else if (leaf_model_int == 4) model_type = StochTree::ModelType::kOurNewModel; // New model -else StochTree::Log::Fatal("Invalid model type"); -``` - -And then add the integer code for your new model to the `leaf_model_type` documentation in `stochtree/config.py` - -## Additional Considerations - -Some of the `SuffStat` and `LeafModel` classes currently supported by stochtree require extra initialization parameters. -We support this via [variadic templates](https://en.cppreference.com/w/cpp/language/parameter_pack.html) in C++ - -```cpp -template -static inline void GFRSampleOneIter(TreeEnsemble& active_forest, ForestTracker& tracker, ForestContainer& forests, LeafModel& leaf_model, ForestDataset& dataset, - ColumnVector& residual, TreePrior& tree_prior, std::mt19937& gen, std::vector& variable_weights, - std::vector& sweep_update_indices, double global_variance, std::vector& feature_types, int cutpoint_grid_size, - bool keep_forest, bool pre_initialized, bool backfitting, int num_features_subsample, LeafSuffStatConstructorArgs&... leaf_suff_stat_args) -``` - -If your new classes take any initialization arguments, these are provided in the factory functions, so you might also need to edit the signature of the factory functions. diff --git a/docs/development/roadmap.md b/docs/development/roadmap.md deleted file mode 100644 index d06360b42..000000000 --- a/docs/development/roadmap.md +++ /dev/null @@ -1,21 +0,0 @@ -# Development Roadmap - -We are working hard to make `stochtree` faster, easier to use, and more flexible! 
Below is a snapshot of our development roadmap. We categorize new product enhancements into four categories: - -1. **User Interface**: the way that a user can build, store, and use models -2. **Performance**: program runtime and memory usage of various models -3. **Modeling Features**: scope of modeling tools provided -4. **Interoperability**: compatibility with other computing and data libraries - -Our development goals are prioritized along three broad timelines - -1. **Now**: development is currently underway or planned for a near-term release -2. **Next**: design / research needed; development hinges on feasibility and time demands -3. **Later**: long-term goal; exploratory - -| Category | Now | Next | Later | -| --- | --- | --- | --- | -| User Interface | | | | -| Performance | | | Hardware acceleration (Apple Silicon GPU)
<br>Hardware acceleration (NVIDIA GPU)<br>Out-of-memory sampler | -| Modeling Features | Quantile cutpoint sampling<br>Probit BART and BCF | Monotonicity constraints<br>Multiclass classification | | -| Interoperability | | | PyMC (Python)<br>Stan (R / Python)<br>Apache Arrow (R / Python)<br>
Polars (Python) | diff --git a/docs/getting-started.md b/docs/getting-started.md deleted file mode 100644 index cf4a6742c..000000000 --- a/docs/getting-started.md +++ /dev/null @@ -1,182 +0,0 @@ -# Getting Started - -`stochtree` is composed of a C++ "core" and R / Python interfaces to that core. -Below, we detail how to install the R / Python packages, or work directly with the C++ codebase. - -## R Package - -### CRAN - -The R package can be installed from CRAN via - -``` -install.packages("stochtree") -``` - -### Development Version (Local Build) - -The development version of `stochtree` can be installed from Github via - -``` -remotes::install_github("StochasticTree/stochtree", ref="r-dev") -``` - -## Python Package - -### PyPI - -`stochtree`'s Python package can be installed from PyPI via - -``` -pip install stochtree -``` - -### Development Version (Local Build) - -The development version of `stochtree` can be installed from source using pip's [git interface](https://pip.pypa.io/en/stable/topics/vcs-support/). -To proceed, you will need a working version of [git](https://git-scm.com) and python 3.8 or greater (available from several sources, one of the most -straightforward being the [anaconda](https://docs.conda.io/projects/conda/en/stable/user-guide/install/index.html) suite). - -#### Quick start - -Without worrying about virtual environments (detailed further below), `stochtree` can be installed from the command line - -``` -pip install numpy scipy pytest pandas scikit-learn pybind11 -pip install git+https://github.com/StochasticTree/stochtree.git -``` - -#### Virtual environment installation - -Often, users prefer to manage different projects (with different package / python version requirements) in virtual environments. - -##### Conda - -Conda provides a straightforward experience in managing python dependencies, avoiding version conflicts / ABI issues / etc. 
- -To build stochtree using a `conda` based workflow, first create and activate a conda environment with the requisite dependencies - -```{bash} -conda create -n stochtree-dev -c conda-forge python=3.10 numpy scipy pytest pandas pybind11 scikit-learn -conda activate stochtree-dev -``` - -Then install the package from github via pip - -```{bash} -pip install git+https://github.com/StochasticTree/stochtree.git -``` - -(*Note*: if you'd like to run `stochtree`'s notebook examples, you will also need `jupyterlab`, `seaborn`, and `matplotlib`) - -```{bash} -conda install matplotlib seaborn -pip install jupyterlab -``` - -With these dependencies installed, you can [clone the repo](#cloning-the-repository) and run the `demo/` examples. - -##### Venv - -You could also use venv for environment management. First, navigate to the folder in which you usually store virtual environments -(i.e. `cd /path/to/envs`) and create and activate a virtual environment: - -```{bash} -python -m venv venv -source venv/bin/activate -``` - -Install all of the package (and demo notebook) dependencies - -```{bash} -pip install numpy scipy pytest pandas scikit-learn pybind11 -``` - -Then install stochtree via - -```{bash} -pip install git+https://github.com/StochasticTree/stochtree.git -``` - -As above, if you'd like to run the notebook examples in the `demo/` subfolder, you will also need `jupyterlab`, `seaborn`, and `matplotlib` and you will have to [clone the repo](#cloning-the-repository) - -```{bash} -pip install matplotlib seaborn jupyterlab -``` - -## C++ Core - -While the C++ core links to both R and Python for a performant, high-level interface, -the C++ code can be compiled and unit-tested and compiled into a standalone -[debug program](https://github.com/StochasticTree/stochtree/tree/main/debug). 
- -### Compilation - -#### Cloning the Repository - -To clone the repository, you must have git installed, which you can do following [these instructions](https://learn.microsoft.com/en-us/devops/develop/git/install-and-set-up-git). - -Once git is available at the command line, navigate to the folder that will store this project (in bash / zsh, this is done by running `cd` followed by the path to the directory). -Then, clone the `stochtree` repo as a subfolder by running -```{bash} -git clone --recursive https://github.com/StochasticTree/stochtree.git -``` - -*NOTE*: this project incorporates several dependencies as [git submodules](https://git-scm.com/book/en/v2/Git-Tools-Submodules), -which is why the `--recursive` flag is necessary (some systems may perform a recursive clone without this flag, but -`--recursive` ensures this behavior on all platforms). If you have already cloned the repo without the `--recursive` flag, -you can retrieve the submodules recursively by running `git submodule update --init --recursive` in the main repo directory. - -#### CMake Build - -The C++ project can be built independently from the R / Python packages using `cmake`. -See [here](https://cmake.org/install/) for details on installing cmake (alternatively, -on MacOS, `cmake` can be installed using [homebrew](https://formulae.brew.sh/formula/cmake)). -Once `cmake` is installed, you can build the CLI by navigating to the main -project directory at your command line (i.e. `cd /path/to/stochtree`) and -running the following code - -```{bash} -rm -rf build -mkdir build -cmake -S . -B build -cmake --build build -``` - -The CMake build has two primary targets, which are detailed below - -##### Debug Program - -`debug/api_debug.cpp` defines a standalone target that can be straightforwardly run with a debugger (i.e. `lldb`, `gdb`) -while making non-trivial changes to the C++ code. 
-This debugging program is compiled as part of the CMake build if the `BUILD_DEBUG_TARGETS` option in `CMakeLists.txt` is set to `ON`. - -Once the program has been built, it can be run from the command line via `./build/debugstochtree` or attached to a debugger -via `lldb ./build/debugstochtree` (clang) or `gdb ./build/debugstochtree` (gcc). - -##### Unit Tests - -We test `stochtree` using the [GoogleTest](https://google.github.io/googletest/) framework. -Unit tests are compiled into a single target as part of the CMake build if the `BUILD_TEST` option is set to `ON` -and the test suite can be run after compilation via `./build/teststochtree` - -### Xcode - -While using `gdb` or `lldb` on `debugstochtree` at the command line is very helpful, users may prefer debugging in a full-fledged IDE like Xcode. This project's C++ core can be converted to an Xcode project from `CMakeLists.txt`, but first you must turn off sanitizers (Xcode seems to have its own way of setting this at build time for different configurations, and having injected -`-fsanitize=address` statically into compiler arguments will cause Xcode errors). To do this, modify the `USE_SANITIZER` line in `CMakeLists.txt`: - -``` -option(USE_SANITIZER "Use santizer flags" OFF) -``` - -or via command-line argument to `cmake -G Xcode` as shown below. To generate an Xcode project based on the build targets and specifications defined in a `CMakeLists.txt` file (and ensure that debug and sanitizer flags are switched off), navigate to the main project folder (i.e. `cd /path/to/project`) and run the following commands: - -```{bash} -rm -rf xcode/ -mkdir xcode -cd xcode -cmake -G Xcode .. -DCMAKE_C_COMPILER=cc -DCMAKE_CXX_COMPILER=c++ -DUSE_SANITIZER=OFF -DUSE_DEBUG=OFF -cd .. -``` - -Now, if you navigate to the xcode subfolder (in Finder), you should be able to click on a `.xcodeproj` file and the project will open in Xcode. 
diff --git a/docs/index.md b/docs/index.md deleted file mode 100644 index 50fb41ecf..000000000 --- a/docs/index.md +++ /dev/null @@ -1,53 +0,0 @@ -# StochTree - -`stochtree` (short for "stochastic trees") unlocks flexible decision tree modeling in R or Python. - -## Table of Contents - -* [Getting Started](getting-started.md): Details on how to install and use `stochtree` -* [About](about.md): Overview of the models supported by stochtree and pointers to further reading -* [R Package](R_docs/index.md): Complete documentation of the R package -* [Python Package](python_docs/index.md): Complete documentation of the Python package -* [C++ Core API and Architecture](cpp_docs/index.md): Overview and documentation of the C++ codebase that supports stochtree -* [Advanced Vignettes](vignettes/index.md): In-depth tutorials on new methods implemented using stochtree -* [Development](development/index.md): Roadmap and how to contribute - -## What does the software do? - -Boosted decision tree models (like [xgboost](https://xgboost.readthedocs.io/en/stable/), -[LightGBM](https://lightgbm.readthedocs.io/en/latest/), or -[scikit-learn's HistGradientBoostingRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingRegressor.html)) -are great, but often require time-consuming hyperparameter tuning. -`stochtree` can help you avoid this, by running a fast Bayesian analog of gradient boosting (called BART -- Bayesian Additive Regression Trees). - -`stochtree` has two primary interfaces: - -1. "High-level": robust implementations of many popular stochastic tree algorithms (BART, XBART, BCF, XBCF), with support for serialization and parallelism. -2. "Low-level": access to the "inner loop" of a forest sampler, allowing custom tree algorithm development in <50 lines of code. - -The "core" of the software is written in C++, but it provides R and Python APIs. 
-The R package is [available on CRAN](https://cran.r-project.org/web/packages/stochtree/index.html) and the Python package is available on PyPI (`pip install stochtree`). - -## Why "stochastic" trees? - -"Stochastic" loosely means the same thing as "random." This naturally raises the question: how is `stochtree` different from a random forest library? -At a superficial level, both are decision tree ensembles that use randomness in training. - -The difference lies in how that "randomness" is deployed. -Random forests take random subsets of a training dataset, and then run a deterministic decision tree fitting algorithm ([recursive partitioning](https://en.wikipedia.org/wiki/Recursive_partitioning)). -Stochastic tree algorithms use randomness to construct decision tree ensembles from a fixed training dataset. - -The original stochastic tree model, [Bayesian Additive Regression Trees (BART)](https://projecteuclid.org/journals/annals-of-applied-statistics/volume-4/issue-1/BART-Bayesian-additive-regression-trees/10.1214/09-AOAS285.full), used [Markov Chain Monte Carlo (MCMC)](https://en.wikipedia.org/wiki/Markov_chain_Monte_Carlo) to sample forests from their posterior distribution. - -So why not call our project `bayesiantree`? - -Some algorithms implemented in `stochtree` are "quasi-Bayesian" in that they are inspired by a Bayesian model, but are sampled with fast algorithms that do not provide a valid Bayesian posterior distribution. - -Moreover, we think of stochastic forests as general-purpose modeling tools. -What makes them useful is their strong empirical performance -- especially on small or noisy datasets -- not their adherence to any statistical framework. - -So why not just call our project `decisiontree`? - -Put simply, the sampling approach is part of what makes BART and other `stochtree` algorithms work so well -- we know because we have tested out versions that did not do stochastic sampling of the tree fits. 
- -So we settled on the term "stochastic trees", or "stochtree" for short (pronounced "stoke-tree"). diff --git a/docs/javascripts/mathjax.js b/docs/javascripts/mathjax.js deleted file mode 100644 index 3d0d92528..000000000 --- a/docs/javascripts/mathjax.js +++ /dev/null @@ -1,19 +0,0 @@ -window.MathJax = { - tex: { - inlineMath: [["\\(", "\\)"]], - displayMath: [["\\[", "\\]"]], - processEscapes: true, - processEnvironments: true - }, - options: { - ignoreHtmlClass: ".*|", - processHtmlClass: "arithmatex" - } - }; - - document$.subscribe(() => { - MathJax.startup.output.clearCache() - MathJax.typesetClear() - MathJax.texReset() - MathJax.typesetPromise() - }) \ No newline at end of file diff --git a/docs/python_docs/api/bart.md b/docs/python_docs/api/bart.md deleted file mode 100644 index 1d9a835e5..000000000 --- a/docs/python_docs/api/bart.md +++ /dev/null @@ -1,6 +0,0 @@ -# BART - -::: stochtree.bart.BARTModel - options: - show_source: false - members_order: source diff --git a/docs/python_docs/api/bcf.md b/docs/python_docs/api/bcf.md deleted file mode 100644 index a5e1b305b..000000000 --- a/docs/python_docs/api/bcf.md +++ /dev/null @@ -1,6 +0,0 @@ -# BCF - -::: stochtree.bcf.BCFModel - options: - show_source: false - members_order: source diff --git a/docs/python_docs/api/index.md b/docs/python_docs/api/index.md deleted file mode 100644 index 819aa75ee..000000000 --- a/docs/python_docs/api/index.md +++ /dev/null @@ -1,13 +0,0 @@ -# StochTree Python API Reference - -The `stochtree` interface is divided into two "levels": - -1. "High level": end-to-end implementations of stochastic tree ensembles for supervised learning (BART / XBART) and causal inference (BCF / XBCF). Both interfaces are designed to mirror the [scikit-learn estimator style](https://scikit-learn.org/dev/developers/develop.html), with the `.fit()` method replaced by a `.sample()` method. 
We also provide a scikit-learn-compatible BART estimator which treats the posterior mean forest predictions as a single supervised learning model and enables cross-validation / model selection through `sklearn`. - 1. The BART (supervised learning) interface is documented [here](bart.md). - 2. The BCF (causal inference) interface is documented [here](bcf.md). - 3. The scikit-learn BART estimator is documented [here](sklearn.md). -2. "Low level": we provide access to most of the C++ sampling objects and functionality via Python, which allow for custom sampling algorithms and integration of other model terms. This interface is documented [here](low-level/index.md) and consists broadly of the following components: - 1. [Data API](low-level/dataset.md): loading and storing in-memory data needed to train `stochtree` models. - 2. [Forest API](low-level/forest.md): creating, storing, and modifying ensembles of decision trees that underlie all `stochtree` models. - 3. [Sampler API](low-level/sampler.md): sampling from stochastic tree ensemble models as well as several supported parametric models. - 4. [Utilities API](low-level/utilities.md): seeding a C++ random number generator, preprocessing data, and serializing models to JSON (files or in-memory strings). 
diff --git a/docs/python_docs/api/low-level/dataset.md b/docs/python_docs/api/low-level/dataset.md deleted file mode 100644 index b1c9e1e68..000000000 --- a/docs/python_docs/api/low-level/dataset.md +++ /dev/null @@ -1,14 +0,0 @@ -# Data API - -::: stochtree.data.Dataset - options: - show_source: false - members_order: source - merge_init_into_class: true - -::: stochtree.data.Residual - options: - show_source: false - members_order: source - merge_init_into_class: true - diff --git a/docs/python_docs/api/low-level/forest.md b/docs/python_docs/api/low-level/forest.md deleted file mode 100644 index fc73e102e..000000000 --- a/docs/python_docs/api/low-level/forest.md +++ /dev/null @@ -1,11 +0,0 @@ -# Forest API - -::: stochtree.forest.Forest - options: - show_source: false - members_order: source - -::: stochtree.forest.ForestContainer - options: - show_source: false - members_order: source diff --git a/docs/python_docs/api/low-level/index.md b/docs/python_docs/api/low-level/index.md deleted file mode 100644 index 56d05ee44..000000000 --- a/docs/python_docs/api/low-level/index.md +++ /dev/null @@ -1,11 +0,0 @@ -# Low-Level API - -In addition to high-level samplers for [BART](../bart.md) and [BCF](../bcf.md), -the `stochtree` Python library provides direct access to many of the computational structures that -underlie stochastic tree algorithms: tree ensembles, sampling algorithms, and "tracking" data structures -that enable the algorithms to work effectively. This interface consists of: - -1. [Data API](dataset.md): loading and storing in-memory data needed to train `stochtree` models. -2. [Forest API](forest.md): creating, storing, and modifying ensembles of decision trees that underlie all `stochtree` models. -3. [Sampler API](sampler.md): sampling from stochastic tree ensemble models as well as several supported parametric models. -4. 
[Utilities API](utilities.md): seeding a C++ random number generator, preprocessing data, and serializing models to JSON (files or in-memory strings). diff --git a/docs/python_docs/api/low-level/sampler.md b/docs/python_docs/api/low-level/sampler.md deleted file mode 100644 index 797bdb899..000000000 --- a/docs/python_docs/api/low-level/sampler.md +++ /dev/null @@ -1,16 +0,0 @@ -# Sampler API - -::: stochtree.sampler.ForestSampler - options: - show_source: false - members_order: source - -::: stochtree.sampler.GlobalVarianceModel - options: - show_source: false - members_order: source - -::: stochtree.sampler.LeafVarianceModel - options: - show_source: false - members_order: source diff --git a/docs/python_docs/api/low-level/utilities.md b/docs/python_docs/api/low-level/utilities.md deleted file mode 100644 index 580b8001c..000000000 --- a/docs/python_docs/api/low-level/utilities.md +++ /dev/null @@ -1,16 +0,0 @@ -# Utilities API - -::: stochtree.sampler.RNG - options: - show_source: false - members_order: source - -::: stochtree.preprocessing.CovariatePreprocessor - options: - show_source: false - members_order: source - -::: stochtree.serialization.JSONSerializer - options: - show_source: false - members_order: source diff --git a/docs/python_docs/api/sklearn.md b/docs/python_docs/api/sklearn.md deleted file mode 100644 index e889d66eb..000000000 --- a/docs/python_docs/api/sklearn.md +++ /dev/null @@ -1,11 +0,0 @@ -# `stochtree` Models Wrapped as `sklearn` Estimators - -::: stochtree.StochTreeBARTRegressor - options: - show_source: false - members_order: source - -::: stochtree.StochTreeBARTBinaryClassifier - options: - show_source: false - members_order: source diff --git a/docs/python_docs/demo/index.md b/docs/python_docs/demo/index.md deleted file mode 100644 index 888ada454..000000000 --- a/docs/python_docs/demo/index.md +++ /dev/null @@ -1,12 +0,0 @@ -# StochTree Python API Demo - -The following demos showcase (some of) the functionality and output of the 
`stochtree` python package. - -1. [Supervised Learning](supervised_learning.ipynb): using `BARTModel()` for classic supervised learning tasks -2. [Causal Inference](causal_inference.ipynb): using `BCFModel()` for causal effect estimation -3. [Heteroskedastic Supervised Learning](heteroskedastic_supervised_learning.ipynb): using `BARTModel()` for supervised learning tasks with heteroskedasticity (covariate-dependent variance) -4. [Multivariate Treatment Causal Inference](multivariate_treatment_causal_inference.ipynb): using `BCFModel()` for causal effect estimation with a multivariate (continuous) treatment variable -5. [Model Serialization](serialization.ipynb): saving and reloading `stochtree` models via JSON -6. [Internal Tree Inspection](tree_inspection.ipynb): inspecting the trees in a sampled `stochtree` forest -7. [Scikit-Learn Estimator Wrappers](sklearn_wrappers.ipynb): wrappers that enable `stochtree.BARTModel` to be used as a `sklearn` estimator (for e.g. cross-validation or model selection) -8. [Low-Level Interface](prototype_interface.ipynb): using the low-level `stochtree` interface to construct a custom sampling loop diff --git a/docs/python_docs/index.md b/docs/python_docs/index.md deleted file mode 100644 index 192baf819..000000000 --- a/docs/python_docs/index.md +++ /dev/null @@ -1,6 +0,0 @@ -# StochTree Python Library - -Our documentation of the `stochtree` python library has two components: - -1. [API Documentation](api/index.md): in-depth documentation of the classes and functions that govern the `stochtree` python API -2. 
[Demos](demo/index.md): notebook-style vignettes that showcase the functionality, output, and use cases of `stochtree` in python diff --git a/docs/vignettes/Python/.gitkeep b/docs/vignettes/Python/.gitkeep deleted file mode 100644 index e69de29bb..000000000 diff --git a/docs/vignettes/R/.gitkeep b/docs/vignettes/R/.gitkeep deleted file mode 100644 index e69de29bb..000000000 diff --git a/docs/vignettes/index.md b/docs/vignettes/index.md deleted file mode 100644 index cca7a14e5..000000000 --- a/docs/vignettes/index.md +++ /dev/null @@ -1,12 +0,0 @@ -# Advanced StochTree Vignettes - -While the R and Python package documentation contains ample documentation and examples, -`stochtree`'s origin in the world of academic decision tree research means it is -actively used to develop novel algorithms and applications. -This section includes in-depth tutorials on how to implement new methods using stochtree, -with an R and Python version for each vignette. - -Current advanced vignettes include: - -1. Using `stochtree` for Regression Discontinuity Design ([R](R/rdd.html), [Python](Python/rdd.html)) -2. Using `stochtree` for Instrumental Variables Analysis ([R](R/iv.html), [Python](Python/iv.html)) diff --git a/mkdocs.yml b/mkdocs.yml deleted file mode 100644 index a14aab0b7..000000000 --- a/mkdocs.yml +++ /dev/null @@ -1,140 +0,0 @@ -site_name: StochTree -site_url: "https://stochtree.ai/" -site_description: "Stochastic tree ensembles (i.e. 
BART, XBART) for supervised learning and causal inference" -repo_url: "https://github.com/StochasticTree/stochtree" -repo_name: "StochasticTree/stochtree" -theme: - name: material - features: - - navigation.expand - - navigation.indexes - - navigation.footer - - navigation.path - - navigation.sections - - navigation.tabs - - navigation.tabs.sticky - - search.suggest - - search.highlight - - search.share - - toc.follow - - toc.integrate - palette: - - media: "(prefers-color-scheme)" - toggle: - icon: material/link - name: Switch to light mode - - media: "(prefers-color-scheme: light)" - scheme: default - primary: blue grey - accent: blue grey - toggle: - icon: material/toggle-switch - name: Switch to dark mode - - media: "(prefers-color-scheme: dark)" - scheme: slate - primary: black - accent: indigo - toggle: - icon: material/toggle-switch-off - name: Switch to system preference -nav: - - Home: index.md - - 'Getting Started': getting-started.md - - 'About StochTree': about.md - - 'R Package': - - 'R Package': R_docs/index.md - - 'Pkgdown Site': 'R_docs/pkgdown/index.html' - - 'Python Package': - - 'Python Package': python_docs/index.md - - 'API': - - 'API': python_docs/api/index.md - - 'BART': python_docs/api/bart.md - - 'BCF': python_docs/api/bcf.md - - 'Scikit-Learn Interface': python_docs/api/sklearn.md - - 'Low level interface': - - 'Low level interface': python_docs/api/low-level/index.md - - 'Data API': python_docs/api/low-level/dataset.md - - 'Forest API': python_docs/api/low-level/forest.md - - 'Sampler API': python_docs/api/low-level/sampler.md - - 'Utilies API': python_docs/api/low-level/utilities.md - - 'Demo': - - 'Demo': python_docs/demo/index.md - - 'BART': python_docs/demo/supervised_learning.ipynb - - 'BCF': python_docs/demo/causal_inference.ipynb - - 'Ordinal Outcome Modeling': python_docs/demo/ordinal_outcome.ipynb - - 'Multi-Chain Inference': python_docs/demo/multi_chain.ipynb - - 'Heteroskedastic BART': 
python_docs/demo/heteroskedastic_supervised_learning.ipynb - - 'Multivariate treatment BCF': python_docs/demo/multivariate_treatment_causal_inference.ipynb - - 'Semiparametric BCF': python_docs/demo/reparameterized_causal_inference.ipynb - - 'Model Serialization': python_docs/demo/serialization.ipynb - - 'Summary and Plotting Utilities': python_docs/demo/summary.ipynb - - 'Internal Tree Inspection': python_docs/demo/tree_inspection.ipynb - - 'Scikit-Learn API': python_docs/demo/sklearn_wrappers.ipynb - - 'Low-Level Interface': python_docs/demo/prototype_interface.ipynb - - 'C++ Core API and Architecture': - - 'C++ Core API and Architecture': cpp_docs/index.md - - 'Tracking Data Structures': cpp_docs/tracking.md - - 'C++ Doxygen Site': 'cpp_docs/doxygen/index.html' - - 'Vignettes': - - 'Vignettes': vignettes/index.md - - 'R': - - 'Instrumental Variables': 'vignettes/R/iv.html' - - 'RDD': 'vignettes/R/rdd.html' - - 'Python': - - 'Instrumental Variables': 'vignettes/Python/iv.html' - - 'RDD': 'vignettes/Python/rdd.html' - - 'Development': - - 'Development': development/index.md - - 'Contributing': development/contributing.md - - 'Adding New Models': development/new-models.md - - 'Roadmap': development/roadmap.md -extra: - social: - - icon: fontawesome/brands/github - link: https://github.com/StochasticTree/stochtree - name: stochtree on Github - generator: false -copyright: Copyright © 2023 - 2025 Drew Herren, Carlos Carvalho, Richard Hahn, Jared Murray -markdown_extensions: - - tables - - toc: - permalink: "#" - toc_depth: 3 - baselevel: 1 - - pymdownx.tabbed: - alternate_style: true - - pymdownx.highlight: - anchor_linenums: true - line_spans: __span - pygments_lang_class: true - - pymdownx.inlinehilite - - pymdownx.snippets - - pymdownx.arithmatex: - generic: true - - pymdownx.superfences: - custom_fences: - - name: mermaid - class: mermaid - format: !!python/name:pymdownx.superfences.fence_code_format -plugins: - - offline - - search - - mkdocstrings: - handlers: 
- python: - options: - docstring_style: numpy - show_signature: true - line_length: 60 - heading_level: 2 - show_root_heading: true - docstring_options: - ignore_init_summary: true - - mkdocs-jupyter: - execute: true - include_source: True - include_requirejs: true - custom_mathjax_url: "https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/latest.js?config=TeX-AMS_CHTML-full,Safe" -extra_javascript: - - javascripts/mathjax.js - - https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js