In [11]:
# NOTE(review): without `-y`, pip asks for a Y/n confirmation; the recorded output of this
# cell shows the run being cancelled at that prompt. The next cell imports treefarms, so
# confirm whether this uninstall step is meant to stay in the tutorial.
!pip uninstall treefarms

[0mFound existing installation: treefarms 0.2.3
Uninstalling treefarms-0.2.3:
  Would remove:
    /home/users/vb97/.local/lib/python3.10/site-packages/treefarms-0.2.3.dist-info/*
    /home/users/vb97/.local/lib/python3.10/site-packages/treefarms/*
Proceed (Y/n)? ^C
[31mERROR: Operation cancelled by user[0m[31m
[0m

In [None]:
import pandas as pd
import numpy as np
import pathlib
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from treefarms.model.threshold_guess import compute_thresholds, cut
from treefarms import TREEFARMS
from treefarms.model.model_set import ModelSetContainer

# Example

In this example, we run TREEFARMS on COMPAS, a recidivism dataset. The COMPAS dataset contains 6907 samples and 7 continuous features; the file loaded below is a binned version in which the features are encoded as 12 binary columns. We visualize the Rashomon set using the `timbertrek` package and show how to obtain individual trees from the Rashomon set.


In [2]:
# Load the binned COMPAS table from disk.
df = pd.read_csv("../experiments/datasets/compas/binned.csv")

# Every column except the last is used as a feature; the last column is the target.
h = df.columns[:-1]
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Keep the frame as the cell's final expression so the notebook renders it.
df


Unnamed: 0,sex:Female,age:<21,age:<23,age:<26,age:<46,juvenile-felonies:=0,juvenile-misdemeanors:=0,juvenile-crimes:=0,priors:=0,priors:=1,priors:2-3,priors:>3,recidivate-within-two-years:1
0,0,0,0,0,0,1,1,1,1,0,0,0,0
1,0,0,0,0,1,1,1,1,1,0,0,0,1
2,0,0,1,1,1,1,1,0,0,0,0,1,1
3,0,0,0,0,1,1,1,1,1,0,0,0,0
4,0,0,0,0,1,1,1,1,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6902,0,0,1,1,1,1,1,1,1,0,0,0,0
6903,0,0,1,1,1,1,1,1,1,0,0,0,0
6904,0,0,0,0,0,1,1,1,1,0,0,0,0
6905,1,0,0,0,1,1,1,1,0,0,1,0,0


We fit the Rashomon set on the COMPAS dataset.


In [9]:
# train TREEFARMS model
config = {
    "regularization": 0.01,  # regularization penalizes trees with more leaves. We recommend setting it to a relatively high value to find a sparse tree.
    "rashomon_bound_multiplier": 0.05,  # rashomon bound multiplier indicates how large a Rashomon set you would like to get
    "cart_lookahead_depth": 2,  # NOTE(review): not explained in this notebook; confirm its meaning against the TREEFARMS configuration docs
    "depth_budget": 5  # NOTE(review): presumably caps the depth of the trees; confirm
}

model = TREEFARMS(config)

model.fit(X, y)


null
Finding Optimal Objective...
{
  "false": {
    "false": {
      "complexity": 0.009999999776482582,
      "loss": 0.021861879155039787,
      "name": "recidivate-within-two-years:1",
      "prediction": 1
    },
    "feature": 7,
    "name": "juvenile-crimes:=0",
    "reference": 1,
    "relation": "==",
    "true": {
      "complexity": 0.009999999776482582,
      "loss": 0.21644708514213562,
      "name": "recidivate-within-two-years:1",
      "prediction": 0
    },
    "type": "integral"
  },
  "feature": 11,
  "model_objective": 0.3748675286769867,
  "name": "priors:>3",
  "reference": 1,
  "relation": "==",
  "true": {
    "complexity": 0.009999999776482582,
    "loss": 0.10655856132507324,
    "name": "recidivate-within-two-years:1",
    "prediction": 1
  },
  "type": "integral"
}
{
  "false": {
    "complexity": 0.009999999776482582,
    "loss": 0.03938033804297447,
    "name": "recidivate-within-two-years:1",
    "prediction": 1
  },
  "feature": 7,
  "model_objective": 0

<treefarms.model.treefarms.TREEFARMS at 0x7f6a5682ec20>

We then visualize the Rashomon set. 

In [None]:
# TREEFARMS will attempt to obtain feature names from the DataFrame columns.
# However, it is also possible to set them manually, as in the commented-out
# snippet below.
# NOTE(review): `df.columns` also includes the label column (the last column of `df`);
# confirm that this is what `visualize` expects before uncommenting.

# feature_names = df.columns

# feature_description = {
#     "sex": {"info": "Sex", "type": "is", "short": "Sex"},
#     "age": {"info": "Age", "type": "count", "short": "Age"},
#     "juvenile-felonies": {
#         "info": "Number of juvenile felonies",
#         "type": "count",
#         "short": "Juv felony",
#     },
#     "juvenile-misdemeanors": {
#         "info": "Number of juvenile misdemeanors",
#         "type": "count",
#         "short": "Juv misdemeanor",
#     },
#     "juvenile-crimes": {
#         "info": "Number of juvenile crimes",
#         "type": "count",
#         "short": "Juv crime",
#     },
#     "priors": {
#         "info": "Number of prior crimes",
#         "type": "count",
#         "short": "Prior crime",
#     },
#     "recidivate-within-two-years": {
#         "info": "Has recidivated within two years",
#         "type": "yes",
#         "short": "Recidivated",
#     },
# }
# model.visualize(feature_names, feature_description)

# Visualize the Rashomon set of the fitted model with no manually supplied names.
model.visualize()


It is also possible to obtain individual trees from the Rashomon set. The following cell demonstrates getting the accuracy of the first tree in the Rashomon set as well as printing out its structure.

In [None]:
# Index the fitted model to pull a single tree out of the Rashomon set.
first_tree = model[0]

# Report that tree's accuracy on the same data the model was fitted on.
print(f'The accuracy of the first tree on the data is: {first_tree.score(X, y)}')

# Printing the tree shows its structure.
print(first_tree)

Thank you for reading our tutorial. Please do try out our methods with different parameters and datasets. Happy tree farming!
