## Test notebook for running the GOSDT implementation

In [17]:
import sys
sys.path.append("..")
import preprocessing.Datasets as DS
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
import gosdt
import json
from gosdt.model.threshold_guess import compute_thresholds

# Load dataset
# `problem` is the loader picked from preprocessing.Datasets; its upper-cased
# function name is printed as a banner for this run.
problem = DS.banknote
pname = problem.__name__.upper()
print(f'---{pname}---')

# The loader is called with the directory of the original datasets and returns
# a frame whose 'y' column is the target; every other column is a feature.
df = problem('../../datasets/original/')
y = df['y']
X = df.drop(columns='y')

---BANKNOTE---


### Testing implementation

In [21]:
# Setting up GOSDT configuration
# `labelpath` is the CSV file that the training cell writes the warm-start
# (reference) labels to; GOSDT is pointed at it through "path_to_labels".
labelpath = "labels.csv"
config = {
    "regularization": 0.05,
    # 5 levels of splits + 1 to account for the root node (the previous
    # expression `5*1` evaluated to 5 and silently dropped the extra level).
    "depth_budget": 5 + 1,
    "reference_LB": True,
    "verbose": True,
    "diagnostics": True,
    "path_to_labels": labelpath,
    "time_limit": 600,
}

In [22]:
# Train & predict GOSDT model
#
# Order matters here: the features are binarised first, then split, and only
# then are the warm labels generated from the training split. Previously the
# warm-label classifier was fitted on X_train / y_train before either name was
# assigned, so the cell failed with a NameError on a fresh kernel
# (Restart & Run All) and only worked when stale variables were present.

# 1) Threshold guessing: compute_thresholds returns the transformed feature
#    frame together with the thresholds, header and elapsed time. The result is
#    stored under new names so the raw X / y from the loading cell are not
#    overwritten and this cell can be re-run safely.
n_est = 40
max_depth_warm_labels = max(1, math.ceil(
    math.log2(len(np.unique(y)))) - 1)
X_bin, thresholds, header, threshold_guess_time = compute_thresholds(
    X, y, n_est, max_depth_warm_labels)
y_df = pd.DataFrame(y)

# 2) Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X_bin, y_df, test_size=0.2, random_state=42)

# 3) Warm labels: predictions of a gradient-boosted ensemble on the training
#    split, written to `labelpath` (one "class_labels" column, no index), which
#    the GOSDT config references through "path_to_labels".
print("Generating warm labels...")
clf = GradientBoostingClassifier(
    n_estimators=n_est, max_depth=5, random_state=42)
clf.fit(X_train, y_train.values.flatten())
warm_labels = clf.predict(X_train)
pd.DataFrame(warm_labels, columns=["class_labels"]).to_csv(
    labelpath, header=True, index=False)

# 4) Fit GOSDT on the training split and keep the JSON form of the tree for
#    the metric-extraction cells further down.
model = gosdt.GOSDT(config)
model.fit(X_train, y_train)
json_tree = json.loads(model.json())

y_pred = model.predict(X_test)

Generating warm labels...
gosdt reported successful execution
training completed. 0.000/0.000/0.003 (user, system, wall), mem=0 MB
bounds: [0.235825..0.235825] (0.000000) loss=0.135825, iterations=86


### Extracting metrics

In [25]:
# Helper function to extract features from the tree using the json representation
def extract_features_from_name(d):
    """Collect the feature part of every ``'<feature><=<threshold>'`` node name.

    Walks nested dictionaries only (values of any other type, lists included,
    are not descended into). For each dictionary whose ``'name'`` entry
    contains ``'<='``, the text before the first ``'<='`` is stripped of
    surrounding whitespace and recorded.

    Args:
        d: The (possibly nested) dictionary to inspect.

    Returns:
        A list of feature names in visiting order: a dictionary's own name
        first, then its values in key order. Duplicates are kept.
    """

    def _iter_features(node):
        # Only dictionaries carry names or children worth visiting.
        if not isinstance(node, dict):
            return
        if 'name' in node and '<=' in node['name']:
            yield node['name'].split('<=')[0].strip()
        for child in node.values():
            yield from _iter_features(child)

    return list(_iter_features(d))

# Function to recursively collect the depth of every "prediction" key
# (the caller averages the returned depths)
def find_prediction_depth(d, current_depth=0, depths=None):
    """Record the nesting depth at which each ``"prediction"`` key occurs.

    Dictionaries and lists are traversed recursively; each level of nesting
    (dictionary value or list item) adds one to the depth. A ``"prediction"``
    key found in a dictionary at ``current_depth`` is recorded as
    ``current_depth + 1`` and its value is not descended into.

    Args:
        d: The nested structure to walk (values that are neither dict nor
            list are ignored).
        current_depth: Depth assigned to ``d`` itself.
        depths: Optional list to append to; a new list is created when None.

    Returns:
        The list of recorded depths (the same object as ``depths`` when one
        was supplied), in visiting order.
    """
    depths = [] if depths is None else depths
    next_depth = current_depth + 1

    if isinstance(d, dict):
        for key in d:
            if key == "prediction":
                depths.append(next_depth)
                continue
            find_prediction_depth(d[key], next_depth, depths)
    elif isinstance(d, list):
        for element in d:
            find_prediction_depth(element, next_depth, depths)

    return depths

In [27]:
# Using custom and built-in functions to extract metrics

# Metrics read straight from the fitted model; the reported maximum depth is
# reduced by one.
nodes = model.tree.source
num_nodes = model.nodes()
num_leaves = model.leaves()
max_depth = model.max_depth() - 1

# Metrics derived from the JSON form of the tree via the helper functions.
# The mean of the recorded prediction depths is reduced by one, mirroring the
# adjustment applied to max_depth; the tree is flagged as imbalanced when
# that mean differs from the maximum depth.
depths = find_prediction_depth(json_tree)
distinct_features = set(extract_features_from_name(json_tree))
features_used = len(distinct_features)
avg_depth = sum(depths) / len(depths) - 1
is_imbalanced_tree = avg_depth != max_depth

print(
    f"Number of nodes: {num_nodes}",
    f"Number of leaves: {num_leaves}",
    f"Maximum depth: {max_depth}",
    f"Average depth: {avg_depth}",
    f"Imbalanced tree: {is_imbalanced_tree}",
    f"Features used: {features_used}",
    sep="\n",
)

Number of nodes: 3
Number of leaves: 2
Maximum depth: 1
Average depth: 1.0
Imbalanced tree: False
Features used: 1
