In [1]:
import pandas as pd
import numpy as np
import smac

In [2]:
# Import ConfigSpace and different types of parameters
from smac.configspace import ConfigurationSpace
from ConfigSpace.hyperparameters import (CategoricalHyperparameter, 
                                         NormalFloatHyperparameter,
                                            UniformFloatHyperparameter, 
            UniformIntegerHyperparameter, 
            IntegerHyperparameter)
from ConfigSpace.conditions import InCondition

# Import SMAC-utilities
from smac.tae.execute_func import ExecuteTAFuncDict


In [3]:
from smac.scenario.scenario import Scenario
from smac.facade.smac_facade import SMAC

In [79]:
import logging
import numpy as np
from sklearn import svm, datasets
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import make_scorer, f1_score
from timeit import default_timer as timer
import lightgbm as lgb
from hyperopt import hp

In [98]:
features = pd.read_csv('data/ft_2000_important.csv')
submit_base = pd.read_csv('data/test.csv')[['Id', 'idhogar']]

train = features[features['Target'].notnull()].copy()
test = features[features['Target'].isnull()].copy()

train_labels = np.array(train.pop('Target'))
test_ids = list(test.pop('idhogar'))

train, test = train.align(test, join = 'inner', axis = 1)

features = pd.read_csv('data/ft_2000_important.csv')
submit_base = pd.read_csv('data/test.csv')[['Id', 'idhogar']]

train = features[features['Target'].notnull()].copy()
test = features[features['Target'].isnull()].copy()

train_labels = np.array(train.pop('Target'))
test_ids = list(test.pop('idhogar'))

train, test = train.align(test, join = 'inner', axis = 1)

  interactivity=interactivity, compiler=compiler, result=result)


In [80]:
# Define the search space
space = {
    'boosting_type': hp.choice('boosting_type', 
                              [{'boosting_type': 'gbdt', 
                                'subsample': hp.uniform('gdbt_subsample', 0.5, 1),
                                'subsample_freq': hp.quniform('gbdt_subsample_freq', 1, 10, 1)}, 
                               {'boosting_type': 'dart', 
                                 'subsample': hp.uniform('dart_subsample', 0.5, 1),
                                 'subsample_freq': hp.quniform('dart_subsample_freq', 1, 10, 1),
                                 'drop_rate': hp.uniform('dart_drop_rate', 0.1, 0.5)},
                                {'boosting_type': 'goss',
                                 'subsample': 1.0,
                                 'subsample_freq': 0}]),
    'limit_max_depth': hp.choice('limit_max_depth', [True, False]),
    'max_depth': hp.quniform('max_depth', 1, 40, 1),
    'num_leaves': hp.quniform('num_leaves', 3, 50, 1),
    'learning_rate': hp.loguniform('learning_rate', 
                                   np.log(0.025), 
                                   np.log(0.25)),
    'subsample_for_bin': hp.quniform('subsample_for_bin', 2000, 100000, 2000),
    'min_child_samples': hp.quniform('min_child_samples', 5, 80, 5),
    'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.0),
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0),
    'colsample_bytree': hp.uniform('colsample_by_tree', 0.5, 1.0)
}

In [81]:
from hyperopt.pyll.stochastic import sample

sample(space)

{'boosting_type': {'boosting_type': 'dart',
  'drop_rate': 0.14096616663541625,
  'subsample': 0.9238708887637695,
  'subsample_freq': 9.0},
 'colsample_bytree': 0.8818559103073851,
 'learning_rate': 0.03509027454510622,
 'limit_max_depth': False,
 'max_depth': 22.0,
 'min_child_samples': 55.0,
 'num_leaves': 11.0,
 'reg_alpha': 0.8414587667651955,
 'reg_lambda': 0.5327558904022563,
 'subsample_for_bin': 2000.0}

In [82]:
def macro_f1_score(labels, predictions):
    # Reshape the predictions as needed
    predictions = predictions.reshape(len(np.unique(labels)), -1 ).argmax(axis = 0)
    
    metric_value = f1_score(labels, predictions, average = 'macro')
    
    # Return is name, value, is_higher_better
    return 'macro_f1', metric_value, True

In [116]:
def objective(hyperparameters):
    """Return validation score from hyperparameters for LightGBM"""
    
    # Keep track of evals
#     global ITERATION
#     ITERATION += 1
    
    # Retrieve the subsample
    # subsample = hyperparameters['boosting_type'].get('subsample', 1.0)
    # subsample_freq = hyperparameters['boosting_type'].get('subsample_freq', 0)
    
    hyperparameters = {k : hyperparameters[k] for k in hyperparameters if hyperparameters[k]}
    
    limit_max_depth = hyperparameters['limit_max_depth']
    
    if limit_max_depth == 'false':
        del hyperparameters['max_depth']
    
#     boosting_type = hyperparameters['boosting_type']
    
#     # Use drop rate with dart
#     if boosting_type == 'dart':
#         hyperparameters['drop_rate'] = hyperparameters['boosting_type']['drop_rate']
        
    # Assign subsample and subsample freq to top level keys
#     hyperparameters['boosting_type'] = boosting_type
#     hyperparameters['subsample'] = subsample
#     hyperparameters['subsample_freq'] = subsample_freq
    
    # Make sure parameters that need to be integers are integers
    for parameter_name in ['num_leaves', 'subsample_for_bin', 
                           'min_child_samples', 'subsample_freq']:
        hyperparameters[parameter_name] = int(hyperparameters[parameter_name])

    if 'n_estimators' in hyperparameters:
        del hyperparameters['n_estimators']
    
    # Using stratified kfold cross validation
    strkfold = StratifiedKFold(n_splits = 5, shuffle = True)
    
    # Convert to arrays for indexing
    features = np.array(train)
    labels = np.array(train_labels).reshape((-1 ))
    
    valid_scores = []
    best_estimators = []
    run_times = []
    
    model = lgb.LGBMClassifier(**hyperparameters, class_weight = 'balanced',
                               n_jobs=-1, metric = 'None',
                               n_estimators=10000)
    
    # Iterate through the folds
    for i, (train_indices, valid_indices) in enumerate(strkfold.split(features, labels)):
        
        # Training and validation data
        X_train = features[train_indices]
        X_valid = features[valid_indices]
        y_train = labels[train_indices]
        y_valid = labels[valid_indices]
        
        start = timer()
        # Train with early stopping
        model.fit(X_train, y_train, early_stopping_rounds = 100, 
                  eval_metric = macro_f1_score, 
                  eval_set = [(X_train, y_train), (X_valid, y_valid)],
                  eval_names = ['train', 'valid'],
                  verbose = 400)
        end = timer()
        # Record the validation fold score
        valid_scores.append(model.best_score_['valid']['macro_f1'])
        best_estimators.append(model.best_iteration_)
        
        run_times.append(end - start)
    
    score = np.mean(valid_scores)
    score_std = np.std(valid_scores)
    loss = 1 - score
    
    run_time = np.mean(run_times)
    run_time_std = np.std(run_times)
    
    estimators = int(np.mean(best_estimators))
    # hyperparameters['n_estimators'] = estimators
    print(type(hyperparameters))
    # Write to the csv file ('a' means append)
#     of_connection = open(OUT_FILE, 'a')
#     writer = csv.writer(of_connection)
#     writer.writerow([loss, hyperparameters, ITERATION, run_time, score, score_std])
#     of_connection.close()
    
#     # Display progress
#     if ITERATION % PROGRESS == 0:
#         display(f'Iteration: {ITERATION}, Current Score: {round(score, 4)}.')
    
#     return {'loss': loss, 'hyperparameters': hyperparameters, 'iteration': ITERATION,
#             'time': run_time, 'time_std': run_time_std, 'status': STATUS_OK, 
#             'score': score, 'score_std': score_std}

    return loss

In [122]:
boosting_type = CategoricalHyperparameter('boosting_type',
                                          ['goss', 'dart', 'gbdt'],
                                          default_value = 'dart')

limit_max_depth = CategoricalHyperparameter('limit_max_depth', 
                                            ['true', 'false'])

max_depth = UniformIntegerHyperparameter('max_depth', 1, 40)

num_leaves = UniformIntegerHyperparameter('num_leaves', 3, 50)


learning_rate = UniformFloatHyperparameter('learning_rate', 
                                     0.025, 0.25)

subsample_for_bin = UniformIntegerHyperparameter('subsample_for_bin', 
                                                 2000, 100000)

min_child_samples = UniformIntegerHyperparameter('min_child_samples', 5, 80,
                                                 default_value = 10)

reg_alpha = UniformFloatHyperparameter('reg_alpha', 0.0, 1.0)
reg_lambda = UniformFloatHyperparameter('reg_lambda', 0.0, 1.0)

colsample_bytree = UniformFloatHyperparameter('colsample_bytree',
                                              0.5, 1.0)


subsample = UniformFloatHyperparameter('subsample', 0.5, 1.0,
                                       default_value = 1.0)

subsample_freq = UniformIntegerHyperparameter('subsample_freq', 1, 10,
                                              default_value = None)

drop_rate = UniformFloatHyperparameter('drop_rate', 0.1, 0.5)

cs = ConfigurationSpace()

cs.add_hyperparameters([boosting_type, limit_max_depth, max_depth, num_leaves, 
                        learning_rate, subsample_for_bin, min_child_samples,
                        reg_alpha, reg_lambda, colsample_bytree, 
                        subsample, subsample_freq, drop_rate])

cs.add_condition(InCondition(child=max_depth, parent=limit_max_depth,
                             values = ['true']))

cs.add_condition(InCondition(child=subsample, parent=boosting_type, 
                             values = ['gbdt', 'dart']))

cs.add_condition(InCondition(child=subsample_freq, parent=boosting_type, 
                                            values = ['gbdt', 'dart']))

cs.add_condition(InCondition(child=drop_rate, parent=boosting_type, 
                                            values = ['dart']))


drop_rate | boosting_type in {'dart'}

In [123]:
scenario = Scenario({"run_obj": "quality", 
                     "runcount-limit": 2,
                     "cs": cs,
                     "deterministic": "true"})

INFO:smac.scenario.scenario.Scenario:Output to smac3-output_2018-08-16_21-11-49_148146


In [124]:
cs.get_default_configuration()

Configuration:
  boosting_type, Value: 'dart'
  colsample_bytree, Value: 0.75
  drop_rate, Value: 0.3
  learning_rate, Value: 0.1375
  limit_max_depth, Value: 'true'
  max_depth, Value: 20
  min_child_samples, Value: 10
  num_leaves, Value: 26
  reg_alpha, Value: 0.5
  reg_lambda, Value: 0.5
  subsample, Value: 1.0
  subsample_for_bin, Value: 51000
  subsample_freq, Value: 6

In [103]:
def_value = objective(cs.get_default_configuration())

Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[18]	train's macro_f1: 0.828241	valid's macro_f1: 0.456539
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[166]	train's macro_f1: 0.95156	valid's macro_f1: 0.452733
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[260]	train's macro_f1: 0.982592	valid's macro_f1: 0.424449
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[22]	train's macro_f1: 0.833188	valid's macro_f1: 0.408718
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[24]	train's macro_f1: 0.836999	valid's macro_f1: 0.424985
<class 'dict'>


In [104]:
def_value

0.5665151821143468

In [125]:
# Optimize, using a SMAC-object
print("Optimizing! Depending on your machine, this might take a few minutes.")
smac = SMAC(scenario=scenario, rng=np.random.RandomState(42),
             tae_runner=objective)

incumbent = smac.optimize()

inc_value = objective(incumbent)

print("Optimized Value: %.2f" % (inc_value))

Optimizing! Depending on your machine, this might take a few minutes.
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[139]	train's macro_f1: 0.945695	valid's macro_f1: 0.435085
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[9]	train's macro_f1: 0.775774	valid's macro_f1: 0.393466
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[81]	train's macro_f1: 0.925549	valid's macro_f1: 0.440679
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[86]	train's macro_f1: 0.916037	valid's macro_f1: 0.451565
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[150]	train's macro_f1: 0.95284	valid's macro_f1: 0.436933
<class 'dict'>
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[25]	train's macro_f1: 0.795286	

INFO:smac.intensification.intensification.Intensifier:Challenger (0.5643) is better than incumbent (0.5685) on 1 runs.
INFO:smac.intensification.intensification.Intensifier:Changes in incumbent:
INFO:smac.intensification.intensification.Intensifier:  boosting_type : 'dart' -> 'gbdt'
INFO:smac.intensification.intensification.Intensifier:  colsample_bytree : 0.75 -> 0.75919495684058
INFO:smac.intensification.intensification.Intensifier:  learning_rate : 0.1375 -> 0.1387213776530116
INFO:smac.intensification.intensification.Intensifier:  max_depth : 20 -> 28
INFO:smac.intensification.intensification.Intensifier:  min_child_samples : 10 -> 30
INFO:smac.intensification.intensification.Intensifier:  num_leaves : 26 -> 16
INFO:smac.intensification.intensification.Intensifier:  reg_alpha : 0.5 -> 0.6552779076340373
INFO:smac.intensification.intensification.Intensifier:  reg_lambda : 0.5 -> 0.6539670570919799
INFO:smac.intensification.intensification.Intensifier:  subsample : 1.0 -> 0.721557676

Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[65]	train's macro_f1: 0.917477	valid's macro_f1: 0.437435
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[69]	train's macro_f1: 0.916964	valid's macro_f1: 0.462649
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[47]	train's macro_f1: 0.866527	valid's macro_f1: 0.458876
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[64]	train's macro_f1: 0.908424	valid's macro_f1: 0.454158
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[84]	train's macro_f1: 0.941494	valid's macro_f1: 0.43612
<class 'dict'>
Optimized Value: 0.55


In [119]:
# We load the iris-dataset (a widely used benchmark)
iris = datasets.load_iris()

def svm_from_cfg(cfg):
    """ Creates a SVM based on a configuration and evaluates it on the
    iris-dataset using cross-validation.
    Parameters:
    -----------
    cfg: Configuration (ConfigSpace.ConfigurationSpace.Configuration)
        Configuration containing the parameters.
        Configurations are indexable!
    Returns:
    --------
    A crossvalidated mean score for the svm on the loaded data-set.
    """
    # For deactivated parameters, the configuration stores None-values.
    # This is not accepted by the SVM, so we remove them.
    cfg = {k : cfg[k] for k in cfg if cfg[k]}
    # We translate boolean values:
    cfg["shrinking"] = True if cfg["shrinking"] == "true" else False
    # And for gamma, we set it to a fixed value or to "auto" (if used)
    if "gamma" in cfg:
        cfg["gamma"] = cfg["gamma_value"] if cfg["gamma"] == "value" else "auto"
        cfg.pop("gamma_value", None)  # Remove "gamma_value"

    clf = svm.SVC(**cfg, random_state=42)

    scores = cross_val_score(clf, iris.data, iris.target, cv=5)
    return 1-np.mean(scores)  # Minimize!

#logger = logging.getLogger("SVMExample")
logging.basicConfig(level=logging.INFO)  # logging.DEBUG for debug output

# Build Configuration Space which defines all parameters and their ranges
cs = ConfigurationSpace()

# We define a few possible types of SVM-kernels and add them as "kernel" to our cs
kernel = CategoricalHyperparameter("kernel", ["linear", "rbf", "poly", "sigmoid"], default_value="poly")
cs.add_hyperparameter(kernel)

# There are some hyperparameters shared by all kernels
C = UniformFloatHyperparameter("C", 0.001, 1000.0, default_value=1.0)
shrinking = CategoricalHyperparameter("shrinking", ["true", "false"], default_value="true")
cs.add_hyperparameters([C, shrinking])

# Others are kernel-specific, so we can add conditions to limit the searchspace
degree = UniformIntegerHyperparameter("degree", 1, 5, default_value=3)     # Only used by kernel poly
coef0 = UniformFloatHyperparameter("coef0", 0.0, 10.0, default_value=0.0)  # poly, sigmoid
cs.add_hyperparameters([degree, coef0])
use_degree = InCondition(child=degree, parent=kernel, values=["poly"])
use_coef0 = InCondition(child=coef0, parent=kernel, values=["poly", "sigmoid"])
cs.add_conditions([use_degree, use_coef0])

# This also works for parameters that are a mix of categorical and values from a range of numbers
# For example, gamma can be either "auto" or a fixed float
gamma = CategoricalHyperparameter("gamma", ["auto", "value"], default_value="auto")  # only rbf, poly, sigmoid
gamma_value = UniformFloatHyperparameter("gamma_value", 0.0001, 8, default_value=1)
cs.add_hyperparameters([gamma, gamma_value])
# We only activate gamma_value if gamma is set to "value"
cs.add_condition(InCondition(child=gamma_value, parent=gamma, values=["value"]))
# And again we can restrict the use of gamma in general to the choice of the kernel
cs.add_condition(InCondition(child=gamma, parent=kernel, values=["rbf", "poly", "sigmoid"]))


# Scenario object
scenario = Scenario({"run_obj": "quality",   # we optimize quality (alternatively runtime)
                     "runcount-limit": 200,  # maximum function evaluations
                     "cs": cs,               # configuration space
                     "deterministic": "true"
                     })

# Example call of the function
# It returns: Status, Cost, Runtime, Additional Infos
def_value = svm_from_cfg(cs.get_default_configuration())
print("Default Value: %.2f" % (def_value))


INFO:smac.scenario.scenario.Scenario:Output to smac3-output_2018-08-16_21-11-33_420397


Default Value: 0.03


In [6]:
# Optimize, using a SMAC-object
print("Optimizing! Depending on your machine, this might take a few minutes.")
smac = SMAC(scenario=scenario, rng=np.random.RandomState(42),
        tae_runner=svm_from_cfg)

incumbent = smac.optimize()

inc_value = svm_from_cfg(incumbent)

print("Optimized Value: %.2f" % (inc_value))


Optimizing! Depending on your machine, this might take a few minutes.


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: 0.0333
INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: 0.0333
INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: 0.0333
INFO:smac.intensification.intensification.Intensifier:Challenger (0.0267) is better than incumbent (0.0333) on 1 runs.
INFO:smac.intensification.intensification.Intensifier:Changes in incumbent:
INFO:smac.intensification.intensification.Intensifier:  C : 1.0 -> 10.939416021476587
INFO:smac.intensification.intensification.Intensifier:  kernel : 'poly' -> 'linear'
INFO:smac.intensification.intensification.Intensifier:  shrinking : 'true' -> 'false'
INFO:smac.intensification.intensification.Intensifier:Challenger (0.0200) is better than incumbent (0.0267) on 1 runs.
INFO:smac.intensification.intensification.Intensifier:Changes in incumbent:
INFO:smac.intensification.inte

Optimized Value: 0.01


In [7]:
import os

In [9]:
# We can also validate our results (though this makes a lot more sense with instances)
smac.validate(config_mode='inc',      # We can choose which configurations to evaluate
              #instance_mode='train+test',  # Defines what instances to validate
              repetitions=100,        # Ignored, unless you set "deterministic" to "false" in line 95
              n_jobs=1)               # How many cores to use in parallel for optimization

TypeError: validate() got an unexpected keyword argument 'output'