# Optimization As A Service (OPTaaS)

Testing out Mind Foundry's OPTaaS capabilities.
These results will be compared to Bayesian Optimization using Hyperopt and SMAC.

In [1]:
import pandas as pd
import numpy as np

import lightgbm as lgb

# Evaluation of the model
from sklearn.model_selection import KFold, train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score

In [2]:
features = pd.read_csv('data/ft_2000_important.csv')
features.shape

  interactivity=interactivity, compiler=compiler, result=result)


(10307, 2016)

# Data 

Created by Featuretools automated feature engineering

In [4]:
submit_base = pd.read_csv('data/test.csv')[['Id', 'idhogar']]

train = features[features['Target'].notnull()].copy()
test = features[features['Target'].isnull()].copy()

train_labels = np.array(train.pop('Target'))
test_ids = list(test.pop('idhogar'))

train, test = train.align(test, join = 'inner', axis = 1)

for c in train:
    if train[c].dtype == 'object':
        train[c] = train[c].astype(np.float32)
        test[c] = test[c].astype(np.float32)

In [6]:
print('Train objects: ', train.columns[np.where(train.dtypes == 'object')])
print('Test objects: ', test.columns[np.where(test.dtypes == 'object')])

Train objects:  Index([], dtype='object')
Test objects:  Index([], dtype='object')


In [7]:
from mindfoundry.optaas.client.client import OPTaaSClient, Goal
from mindfoundry.optaas.client.parameter import (Distribution, CategoricalParameter,
                                                 IntParameter, ChoiceParameter, 
                                                 NumericParameter, FloatParameter)

from mindfoundry.optaas.client.constraint import Constraint

with open('C:/Users/willk/OneDrive/Desktop/optaas_key.txt', 'r') as f:
    api_key = str(f.read())
    
client = OPTaaSClient('https://optaas.mindfoundry.ai', api_key)

# Objective Function

Takes in hyperparameters and returns a score to maximize.

In [9]:
def macro_f1_score(labels, predictions):
    # Reshape the predictions as needed
    predictions = predictions.reshape(len(np.unique(labels)), -1 ).argmax(axis = 0)
    
    metric_value = f1_score(labels, predictions, average = 'macro')
    
    # Return is name, value, is_higher_better
    return 'macro_f1', metric_value, True

def objective(num_leaves, learning_rate, boosting_type,
                      subsample, subsample_for_bin, min_child_samples,
                      reg_alpha, reg_lambda, colsample_bytree, nfolds=5):
    """Return validation score from hyperparameters for LightGBM"""

    # Using stratified kfold cross validation
    strkfold = StratifiedKFold(n_splits = nfolds, shuffle = True)
    
    # Convert to arrays for indexing
    features = np.array(train)
    labels = np.array(train_labels).reshape((-1 ))
    
    valid_scores = []
    best_estimators = []
    
    model = lgb.LGBMClassifier(num_leaves=num_leaves, learning_rate=learning_rate,
                               boosting_type=boosting_type, subsample=subsample,
                               subsample_for_bin=subsample_for_bin, 
                               min_child_samples=min_child_samples,
                               reg_alpha=reg_alpha, reg_lambda=reg_lambda, 
                               colsample_bytree=colsample_bytree,
                               class_weight = 'balanced',
                               n_jobs=-1, n_estimators=10000)
    
    # Iterate through the folds
    for i, (train_indices, valid_indices) in enumerate(strkfold.split(features, labels)):
        
        # Training and validation data
        X_train = features[train_indices]
        X_valid = features[valid_indices]
        y_train = labels[train_indices]
        y_valid = labels[valid_indices]
        
        # Train with early stopping
        model.fit(X_train, y_train, early_stopping_rounds = 100, 
                  eval_metric = macro_f1_score,
                  eval_set = [(X_train, y_train), (X_valid, y_valid)],
                  eval_names = ['train', 'valid'],
                  verbose = -1)
        
        # Record the validation fold score
        valid_scores.append(model.best_score_['valid']['macro_f1'])
        best_estimators.append(model.best_iteration_)
        
    best_estimators = np.array(best_estimators)
    valid_scores = np.array(valid_scores)
    
#     return valid_scores, best_estimators

    # Write to the csv file ('a' means append)
#     of_connection = open(OUT_FILE, 'a')
#     writer = csv.writer(of_connection)
#     writer.writerow([loss, hyperparameters, ITERATION, run_time, best_score, best_std])
#     of_connection.close()

    # Dictionary with information for evaluation
#     return {'loss': loss, 'hyperparameters': hyperparameters, 'iteration': ITERATION,
#             'train_time': run_time, 'status': STATUS_OK}

    return valid_scores.mean()

# Configuration

Define the hyperparameter distributions.

In [10]:
boosting_type = CategoricalParameter('boosting_type', 
                                     values = ['gbdt', 'dart', 'goss'], 
                                     id='boosting_type')

num_leaves = IntParameter('num_leaves', minimum=3, 
                          maximum=50, id='num_leaves')

learning_rate = FloatParameter('learning_rate', minimum=0.025, 
                               maximum=0.25, id='learning_rate',
                               distribution=Distribution.LOGUNIFORM)

subsample = FloatParameter('subsample', minimum=0.5, 
                           maximum=1.0, id='subsample')

subsample_for_bin = IntParameter('subsample_for_bin', minimum=2000, 
                                 maximum=100000, id='subsample_for_bin')

min_child_samples = IntParameter('min_child_samples', minimum=5, 
                                 maximum=80, id='min_child_samples')

reg_alpha = FloatParameter('reg_alpha', minimum=0.0, 
                           maximum=1.0, id='reg_alpha')

reg_lambda = FloatParameter('reg_lambda', minimum=0.0, 
                            maximum=1.0, id='reg_lambda')

colsample_bytree = FloatParameter('colsample_bytree', minimum=0.5, 
                                  maximum=1.0, id='colsample_bytree')


In [11]:
subsample_constraint = Constraint(when=boosting_type=='goss', 
                                  then=subsample==1)

## Create a Task

In [12]:
task = client.create_task(
        title = 'Light GBM Opt',
        goal = Goal.max,
        parameters = [num_leaves, learning_rate, boosting_type,
                      subsample, subsample_for_bin, min_child_samples,
                      reg_alpha, reg_lambda, colsample_bytree],
         constraints = [ Constraint(when=boosting_type=='goss', 
                                    then=subsample==1)]
)

# Run Optimization

In [13]:
%%capture 
best_result, best_configuration = task.run(objective, max_iterations = 100)

## Show Results

In [14]:
best_configuration

{ 'id': '132dc8d6-bab1-4695-b058-6fcbdbf21684',
  'type': 'exploitation',
  'values': { 'boosting_type': 'dart',
              'colsample_bytree': 0.9843467236959204,
              'learning_rate': 0.11598629586769524,
              'min_child_samples': 44,
              'num_leaves': 49,
              'reg_alpha': 0.35397370408131534,
              'reg_lambda': 0.5904910774606467,
              'subsample': 0.6299872254632797,
              'subsample_for_bin': 60611}}

In [15]:
best_result

{ 'configuration': '132dc8d6-bab1-4695-b058-6fcbdbf21684',
  'id': 3250,
  'score': 0.4629755551376399,
  'user_defined_data': None}

In [16]:
import json
with open('task_results.txt', 'w') as f:
    json.dump(str(task.get_results()))

TypeError: dump() missing 1 required positional argument: 'fp'