In [1]:
import pandas as pd

#import sys
#sys.path.insert(0, "./smbox")

from smbox.utils import Logger
from smbox.optimise import Optimise
from smbox.smbox_config import smbox_params
from smbox.paramspace import rf_default_param_space

In [2]:
# Logger instance used by the cells in this notebook.
logger = Logger()

##---- config
# Single dict holding every setting for this experiment run; it is handed
# unchanged to Optimise when the search is launched.
# (No `global` statement: this is already module scope, where `global` is a no-op.)
# NOTE(review): 'output_root' is an absolute path on one developer's machine —
# point it at a directory that exists locally before running.
config = {
    'dataset_source': 'openml',
    'dataset': 38,
    'algorithm': 'rf',
    'search_strategy': 'smbox',
    'search_strategy_config': smbox_params,
    'wallclock': 600,  # the run log reports this budget in seconds
    'output_root': '/Users/salhit/development/smbox/smbox/test/resources/output/',
}
logger.log(f'Experiment Config: {config}', 'DEBUG')
##----

# Seed passed to Optimise together with the config.
_random_seed = 42

In [5]:
def fetch_open_ml_data(dataset_id):
    """Fetch a dataset through the openml client and return it as one DataFrame.

    Args:
        dataset_id: Identifier passed to ``openml.datasets.get_dataset``.

    Returns:
        A ``(frame, target_column)`` tuple. ``frame`` has its columns named by
        the attribute names returned from ``get_data`` plus an extra column
        called ``'target'`` holding the labels; ``target_column`` is always
        the string ``'target'``.
    """
    # Function-scope import: openml is not among the notebook's top-level imports.
    import openml

    oml_dataset = openml.datasets.get_dataset(dataset_id)
    print(oml_dataset)  # echo the dataset summary into the cell output

    # NOTE(review): newer openml-python releases appear to deprecate
    # dataset_format="array" in favour of "dataframe" — confirm against the
    # installed version before upgrading.
    features, labels, _categorical_mask, column_names = oml_dataset.get_data(
        target=oml_dataset.default_target_attribute,
        dataset_format="array",
    )

    # NOTE(review): an attribute that is itself named "target" would be
    # overwritten by the label column here — verify for other datasets.
    frame = pd.DataFrame(features, columns=column_names)
    frame["target"] = labels
    return frame, 'target'

In [8]:
# Read the dataset id from the experiment config instead of repeating the
# literal, so the loaded data cannot drift from what is logged and passed to
# the optimiser (config['dataset'] is 38 in this notebook).
df, target_name = fetch_open_ml_data(config['dataset'])

# Offline alternative: load a local CSV copy instead of calling OpenML.
#df, target_name = pd.read_csv('/Users/salhit/development/smbox/smbox/test/resources/dataset_38.csv'), 'target'

OpenML Dataset
Name..........: sick
Version.......: 1
Format........: ARFF
Upload Date...: 2014-04-06 23:22:19
Licence.......: Public
Download URL..: https://api.openml.org/data/v1/download/38/sick.arff
OpenML URL....: https://www.openml.org/d/38
# of features.: 30
# of instances: 3772


In [9]:
# Preview the first five rows to sanity-check the loaded frame.
display(df.head(n=5))

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,target
0,41.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,125.0,0.0,1.14,0.0,109.0,0.0,,0.0,0
1,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,102.0,1.0,,1.0,,0.0,,1.0,0
2,46.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,109.0,0.0,0.91,0.0,120.0,0.0,,1.0,0
3,70.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,175.0,1.0,,1.0,,0.0,,1.0,0
4,70.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,61.0,0.0,0.87,0.0,70.0,0.0,,2.0,0


In [11]:
# Separate the label column from the features, then replace missing feature
# values with 0. Chained rather than `inplace=True`, so X is bound exactly
# once and re-running the cell always starts from `df`.
y = df[target_name]
X = df.drop(columns=target_name).fillna(0)

In [12]:
# Required data format for SMBOX: a dict with the training features and labels.
data = {"X_train": X, "y_train": y}

# Use the default hyperparameter search space.
cfg_schema = rf_default_param_space

logger.log('-------------Starting SMBOX')
logger.log(f'Initial configuration schema: {cfg_schema}', 'DEBUG')

# Build the optimiser from the experiment config and seed, then run the search.
optimiser = Optimise(config, _random_seed)
best_parameters = optimiser.SMBOXOptimise(data, cfg_schema)

2023-08-24 21:14:10: -------------Starting SMBOX
2023-08-24 21:14:10: Starting run for: 38, for 600 seconds
2023-08-24 21:14:26: Global best so far: 0.9631951758565462
2023-08-24 21:14:29: improvement: 0.027920885052071576
2023-08-24 21:14:29: Global best so far: 0.9911160609086178
2023-08-24 21:14:42: improvement: 0.0009919733751566628
2023-08-24 21:14:42: Global best so far: 0.9921080342837745
2023-08-24 21:15:20: improvement: 0.0015330720130961595
2023-08-24 21:15:20: Global best so far: 0.9936411062968706
2023-08-24 21:16:19: improvement: 0.0014203371267609288
2023-08-24 21:16:19: Global best so far: 0.9950614434236316
2023-08-24 21:20:37: improvement: 0.0004165437916237025
2023-08-24 21:20:37: Global best so far: 0.9954779872152553
2023-08-24 21:20:42: improvement: 1.2693133702224024e-05
2023-08-24 21:20:42: Global best so far: 0.9954906803489575
2023-08-24 21:24:12: Global best: 0.9954906803489575
2023-08-24 21:24:12: Best params: {'max_features': 0.6148978972682412, 'n_estimator