In [5]:
# The gosdt code uses several older sklearn functions, which produces
# many notifications; switch the warnings off so that the cell outputs
# stay clean of those messages.
import warnings
warnings.filterwarnings(action='ignore')

In [6]:
import pandas as pd
import numpy as np
import pathlib
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
# Alteração - correção dos nomes de pacotes
from gosdt.model.threshold_guess import compute_thresholds, cut
from gosdt import GOSDT
import joblib
import random

In [7]:
# Fix the seed for reproducibility: the same value seeds both the
# standard-library generator and NumPy's global generator.
random_state = 42
random.seed(random_state)
np.random.seed(random_state)

In [8]:
# Read the processed salary dataset and keep every column name except the
# last one in `h` (the last column is presumably the target label --
# TODO confirm against the preprocessing step).
df = pd.read_csv(filepath_or_buffer="../data/salary/processed-salary.csv")
h = df.columns[:-1]
# Display the retained column names
h

Index(['workclass_?', 'workclass_Federal-gov', 'workclass_Local-gov',
       'workclass_Never-worked', 'workclass_Private', 'workclass_Self-emp-inc',
       'workclass_Self-emp-not-inc', 'workclass_State-gov',
       'workclass_Without-pay', 'education_10th',
       ...
       'native-country_Trinadad&Tobago', 'native-country_United-States',
       'native-country_Vietnam', 'native-country_Yugoslavia', 'age', 'fnlwgt',
       'education-num', 'capital-gain', 'capital-loss', 'hours-per-week'],
      dtype='object', length=108)

In [9]:
# Load the previously saved train/test split from disk (features and
# targets are stored as four separate joblib files).
# NOTE(review): joblib.load is pickle-based -- only load files from a
# trusted source.
X_train, X_test, y_train, y_test = (
    joblib.load(split_path)
    for split_path in (
        '../data/salary/X_train.joblib',
        '../data/salary/X_test.joblib',
        '../data/salary/y_train.joblib',
        '../data/salary/y_test.joblib',
    )
)

#### 1. Threshold guess: we first use the gradient boosted tree to guess useful thresholds. 

In [10]:
# Gradient boosted tree settings shared by the threshold guess and the
# lower-bound guess: `n_est` is passed as the number of estimators and
# `max_depth` as the depth of each one.
n_est, max_depth = 40, 1

In [11]:
# Guess thresholds.
# Wrap both splits in DataFrames whose columns are the names stored in `h`.
X_train = pd.DataFrame(X_train, columns=h)
X_test = pd.DataFrame(X_test, columns=h)

# compute_thresholds receives a copy of the training frame; it returns the
# thresholded training frame, the thresholds, the resulting header and the
# time spent guessing.
(X_train_guessed,
 thresholds,
 header,
 threshold_guess_time) = compute_thresholds(X_train.copy(), y_train, n_est, max_depth)

# Apply the same thresholds to a copy of the test frame, then select the
# columns listed in `header` so both frames share one column layout.
X_test_guessed = cut(X_test.copy(), thresholds)[header]

# Report both shapes (one per line) and check that the column names agree.
print(X_train_guessed.shape, X_test_guessed.shape, sep="\n")
columns_match = list(X_train_guessed.columns) == list(X_test_guessed.columns)
print("train set column names == test set column names: {}".format(columns_match))

(26048, 14)
(6513, 14)
train set column names == test set column names: True


#### 2. Lower bound guess: we use the gradient boosted tree to get labels for the training dataset. 

In [12]:
# Guess lower bound: fit a gradient boosted classifier on the thresholded
# training features and keep its predictions on that same data as the
# warm labels.
import time
start_time = time.perf_counter()

clf = GradientBoostingClassifier(
    n_estimators=n_est,
    max_depth=max_depth,
    random_state=random_state,
)
# fit() hands back the fitted estimator (scikit-learn convention), so the
# prediction can be chained directly onto it.
warm_labels = clf.fit(X_train_guessed, y_train).predict(X_train_guessed)

# Elapsed time of fit + predict, in seconds (perf_counter difference).
lb_time = elapsed_time = time.perf_counter() - start_time

In [13]:
# Save the warm labels to a temporary file; `labelpath` holds its location
# as a plain string, which is what the GOSDT config receives as
# "path_to_labels".
labelsdir = pathlib.Path('./tmp/warm_lb_labels')
labelsdir.mkdir(parents=True, exist_ok=True)

labelpath = str(labelsdir / 'warm_label.tmp')

# TODO: verify this formats correctly for gosdt (shouldn't require headers)
# NOTE(review): pandas documents `header` as a bool or a list of str;
# confirm which header row a bare string actually produces in the file.
pd.DataFrame(warm_labels).to_csv(labelpath, header="class_labels", index=None)


#### 3. Train GOSDT model

In [14]:
# Train the GOSDT model.
# The configuration enables the warm lower bound and points it at the label
# file written earlier.
# NOTE(review): `time_limit` is presumably expressed in seconds -- confirm
# against the GOSDT configuration reference.
config = {
    "regularization": 0.001,
    "depth_budget": 5,
    "time_limit": 60,
    "warm_LB": True,
    "path_to_labels": labelpath,
    "similar_support": False,
}

model = GOSDT(config)

# The target is handed to fit() wrapped in a DataFrame.
y_train_frame = pd.DataFrame(y_train)
model.fit(X_train_guessed, y_train_frame)

print("evaluate the model, extracting tree and scores", flush=True)

gosdt reported successful execution
training completed. 0.000/0.000/0.212 (user, system, wall), mem=0 MB
bounds: [0.161327..0.161327] (0.000000) loss=0.156327, iterations=1635
evaluate the model, extracting tree and scores


#### 4. Get results

In [15]:
# Get the results: accuracy on both splits, tree size and training time.
train_acc = model.score(X_train_guessed, y_train)
test_acc = model.score(X_test_guessed, y_test)
n_leaves = model.leaves()
n_nodes = model.nodes()
# Stored as `train_time`, not `time`: binding the name `time` here would
# replace the `time` module imported by the lower-bound cell, so any later
# `time.perf_counter()` call would fail until the module is re-imported.
# NOTE(review): `utime` is presumably the user CPU time reported by gosdt,
# not the wall-clock time -- confirm before comparing with other timings.
train_time = model.utime

print("Model training time: {}".format(train_time))
print("Training accuracy: {}".format(train_acc))
print("Test accuracy: {}".format(test_acc))
print("# of leaves: {}".format(n_leaves))
print(model.tree)

Model training time: 0.0
Training accuracy: 0.8436732186732187
Test accuracy: 0.8452326116996776
# of leaves: 5
if capital-gain<=5119.0 = 1 and capital-loss<=1820.5 = 1 and marital-status_Married-civ-spouse<=0.5 = 1 then:
    predicted class: <=50K
    misclassification penalty: 0.024
    complexity penalty: 0.001

else if capital-gain<=5119.0 = 1 and capital-loss<=1820.5 = 1 and education-num<=12.5 = 1 and marital-status_Married-civ-spouse<=0.5 != 1 then:
    predicted class: <=50K
    misclassification penalty: 0.084
    complexity penalty: 0.001

else if capital-gain<=5119.0 = 1 and capital-loss<=1820.5 = 1 and education-num<=12.5 != 1 and marital-status_Married-civ-spouse<=0.5 != 1 then:
    predicted class: >50K
    misclassification penalty: 0.038
    complexity penalty: 0.001

else if capital-gain<=5119.0 = 1 and capital-loss<=1820.5 != 1 then:
    predicted class: >50K
    misclassification penalty: 0.008
    complexity penalty: 0.001

else if capital-gain<=5119.0 != 1 then:
  

As you can see, we find a reasonably accurate model quite quickly, even on a dataset as complicated as the salary dataset used here!

Thank you for reading our tutorial. Please do try out our methods with different parameters and datasets. Happy tree training!