In [2]:
%load_ext autoreload
%autoreload 2

from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import pickle as pkl
from os.path import join as oj
from copy import deepcopy
import pandas as pd
from numpy import array as arr
import time, sys, os
sys.path.append('../src')
import utils

# sklearn models
from sklearn.calibration import calibration_curve
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import export_graphviz, DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.tree import plot_tree

import torch
from torch import nn

# Red and blue as RGB triples, each channel rescaled from 0-255 to the 0-1 range.
cred = tuple(channel / 255 for channel in (234, 51, 86))
cblue = tuple(channel / 255 for channel in (57, 138, 242))

# Output directory for this notebook; created up front, and left alone if it
# already exists.
out_dir = '../results/sim_linear'
os.makedirs(out_dir, exist_ok=True)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# rulefit
- must be installed with: `pip install git+https://github.com/christophM/rulefit.git`
- some docs: `https://github.com/christophM/rulefit`
- original paper: `http://statweb.stanford.edu/~jhf/ftp/RuleFit.pdf`

In [None]:
import numpy as np
import pandas as pd

from rulefit import RuleFit

# Load the Boston housing table; the first CSV column becomes the row index.
boston_data = pd.read_csv("data/boston.csv", index_col=0)

# Separate the regression target (`medv`) from the remaining input columns,
# remembering the column labels before dropping down to plain arrays.
inputs_df = boston_data.drop(columns="medv")
features = inputs_df.columns
X = inputs_df.values
y = boston_data["medv"].values

# Fit RuleFit with its default settings, passing the column labels so the
# fitted model can refer to features by name.
rf = RuleFit()
rf.fit(X, y, feature_names=features)

# In-sample error only: predictions are made on the same rows used for fitting.
preds = rf.predict(X)
train_mse = np.mean((preds - y) ** 2)
print(f'train mse: {train_mse:0.2f}')

*inspect the rules*
- rule is how the feature is constructed
- coef is its weight in the final linear model
- support is the fraction of points it applies to

In [None]:
# Pull every candidate term out of the fitted model, then keep only the ones
# with a nonzero coefficient, ordered from largest to smallest support.
all_rules = rf.get_rules()
has_weight = all_rules["coef"] != 0
rules = all_rules[has_weight].sort_values(by="support", ascending=False)

# Show just the three columns described in the notes above this cell.
shown_columns = ['rule', 'coef', 'support']
print(rules[shown_columns])

# scalable bayesian rule lists
- docs at `https://github.com/myaooo/pysbrl`
- requires installing some C packages first (e.g., see the instructions [here](https://coral.ise.lehigh.edu/jild13/2016/07/11/hello/))
- note: the input format is pretty strange (the [R](https://rdrr.io/cran/sbrl/man/sbrl.html) interface is more sensible)

In [25]:
# this example has categorical variables c1, c2, ... that take on values 'o', 'x', 'b'

import pysbrl

# Train a scalable Bayesian rule list from the data and label files.
# NOTE(review): 20.0 is passed positionally as the third argument — presumably
# the prior on the rule-list length; confirm against the pysbrl docs.
rule_ids, outputs, rule_strings = pysbrl.train_sbrl("data/train.out",
                                                    "data/train.label",
                                                    20.0, eta=2.0, max_iters=2000)

# Print the list as an if / elif / ... cascade. Each line shows the rule text
# followed by the value in column `class_num` of `outputs` for that position.
class_num = 1
last_idx = len(rule_ids) - 1
for i, rule_id in enumerate(rule_ids):
    # Check for the final entry first: it gets no keyword (in the recorded run
    # it is the `default` rule). Testing `i == 0` first would label a one-entry
    # list as 'if', which is wrong when the only entry is the fallback.
    if i == last_idx:
        prefix = ''
    elif i == 0:
        prefix = 'if'
    else:
        prefix = 'elif'
    print(prefix, rule_strings[rule_id] + f': {outputs[i, class_num]:0.2f}')

if {c4=o,c5=o,c6=o}: 0.04
elif {c1=x,c7=x}: 0.86
elif {c5=x,c7=o,c9=o}: 0.58
elif {c1=x,c2=x,c3=x}: 0.98
elif {c5=o,c9=o}: 0.02
elif {c1=o,c2=o,c3=o}: 0.04
elif {c3=x,c4=b,c9=x}: 0.93
elif {c7=o,c9=o}: 0.08
elif {c1=x,c2=b,c3=x}: 0.88
elif {c5=o,c7=o}: 0.17
elif {c6=o,c9=o}: 0.43
elif {c4=x,c5=o}: 0.33
elif {c1=o,c4=o,c7=o}: 0.05
 default: 0.98


# optimal classification tree
- docs here: `https://github.com/pan5431333/pyoptree`
- note: this implementation is still unstable

In [None]:
import pandas as pd
from pyoptree.optree import OptimalHyperTreeModel, OptimalTreeModel

# Column layout shared by both toy tables: a row label, two features, a +1/-1 target.
columns = ["index", "x1", "x2", "y"]
feature_cols = ["x1", "x2"]
label_col = "y"

# Training rows, one tuple per observation.
train_rows = [
    ('A', 1, 1, 1),
    ('C', 2, 2, 1),
    ('D', 2, 1, -1),
    ('E', 2, 0, -1),
    ('F', 3, 1, -1),
]
data = pd.DataFrame(train_rows, columns=columns)

# Evaluation rows: A and C-F repeat the training rows; B and G appear only here.
test_rows = [
    ('A', 1, 1, 1),
    ('B', 1, 2, 1),
    ('C', 2, 2, 1),
    ('D', 2, 1, -1),
    ('E', 2, 0, -1),
    ('F', 3, 1, -1),
    ('G', 3, 0, -1),
]
test_data = pd.DataFrame(test_rows, columns=columns)

# Fit a depth-2 optimal hyper-tree on the two features, using the cplex solver.
model = OptimalHyperTreeModel(
    feature_cols,
    label_col,
    tree_depth=2,
    N_min=1,
    alpha=0.1,
    solver_name="cplex",
)
model.train(data)

predictions = model.predict(test_data)
print(predictions)