In [1]:
import random

import numpy as np
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier

import settree

# PARAMS -- every tunable knob of the experiment lives here.
SET_SIZE = 10      # passed as min_items_set (and SET_SIZE + 1 as max_items_set) when sampling
ITEM_DIM = 2       # dimensionality of each item (2D points)
N_TRAIN = 1_000    # number of training sets to sample
N_TEST = 1_000     # number of test sets to sample
SEED = 0           # random_state handed to the models

## Create dataset
Create a synthetic dataset of 2D points, following Exp. 1 in the paper. This dataset is composed of sets of 2D points:
a positive set contains a single point from the first quadrant, while a negative set contains no points from the first quadrant.

We also configure a SetDataset object to be used in conjunction with SetTree. This object stores the sets in a convenient way.

In [2]:
# Seed the global RNGs before sampling so that Restart & Run All regenerates the
# same train/test sets (SEED was previously only applied to the models).
# NOTE(review): assumes settree.get_first_quarter_data draws from Python's
# `random` and/or NumPy's global RNG -- confirm against the settree source.
random.seed(SEED)
np.random.seed(SEED)

# Sample the sets; every set is requested with min_items_set=SET_SIZE and
# max_items_set=SET_SIZE + 1 items of dimension ITEM_DIM.
# Train and test are drawn one after the other from the same seeded stream.
x_train, y_train = settree.get_first_quarter_data(
    N_TRAIN,
    min_items_set=SET_SIZE,
    max_items_set=SET_SIZE + 1,
    dim=ITEM_DIM,
)
x_test, y_test = settree.get_first_quarter_data(
    N_TEST,
    min_items_set=SET_SIZE,
    max_items_set=SET_SIZE + 1,
    dim=ITEM_DIM,
)

# Wrap the raw records in SetDataset objects, the input type given to SetTree below.
ds_train = settree.SetDataset(records=x_train, is_init=True)
ds_test = settree.SetDataset(records=x_test, is_init=True)

Configure the desired operations for SetTree and print the default list of operations.

In [3]:
# Use settree's default operation list as-is; the same list is later handed to
# both SetTree and flatten_datasets.
list_of_operations = settree.OPERATIONS
print(list_of_operations)

[Op (min), Op (max), Op (sum), Op (mean), Op (sec_mom_mean), Op (harm_mean), Op (geo_mean)]


## Configure and train Set-Tree model

In [4]:
# Set-Tree hyper-parameters.
ATTN_SET_LIMIT = 3          # forwarded as attention_set_limit
USE_ATTN_SET = True         # forwarded as use_attention_set
USE_ATTN_SET_COMP = True    # forwarded as use_attention_set_comp
MAX_DEPTH = 6               # depth cap, also reused by the regular-tree baseline

# Build the Set-Tree classifier.
set_tree_model = settree.SetTree(
    classifier=True,
    criterion='entropy',
    splitter='sklearn',
    max_features=None,
    min_samples_split=2,
    operations=list_of_operations,
    use_attention_set=USE_ATTN_SET,
    use_attention_set_comp=USE_ATTN_SET_COMP,
    attention_set_limit=ATTN_SET_LIMIT,
    max_depth=MAX_DEPTH,
    min_samples_leaf=None,
    random_state=SEED,
)

# Fit on the training sets, then score on the held-out sets:
# accuracy is the fraction of predictions equal to the labels.
set_tree_model.fit(ds_train, y_train)
set_tree_test_pred = set_tree_model.predict(ds_test)
set_tree_test_acc = (set_tree_test_pred == y_test).mean()
print('Set-Tree: Test accuracy: {:.4f}'.format(set_tree_test_acc))

Started training!
Finished training!
Set-Tree: Test accuracy: 0.9980


## Configure and train regular Tree model

In [5]:
# Turn the set datasets into flat feature matrices using the same operation list,
# so a regular decision tree can be trained on them.
x_train_flat, x_test_flat = settree.flatten_datasets(ds_train, ds_test, list_of_operations)

# Baseline: a plain scikit-learn decision tree with the same depth cap and seed.
tree_model = DecisionTreeClassifier(
    criterion="gini",
    splitter="best",
    max_depth=MAX_DEPTH,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.,
    max_features=None,
    random_state=SEED,
)

# Fit on the flattened training features and report accuracy on the test split.
tree_model.fit(x_train_flat, y_train)
tree_test_pred = tree_model.predict(x_test_flat)
tree_test_acc = (tree_test_pred == y_test).mean()
print('Regular Tree: Test accuracy: {:.4f}'.format(tree_test_acc))

Regular Tree: Test accuracy: 0.6050
