In [80]:
import pandas as pd
import numpy as np

import lightgbm as lgb

# Evaluation of the model
from sklearn.model_selection import KFold, train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score

In [41]:
# Load the pre-selected feature matrix; rows with and without a Target value
# live in the same file and are separated in a later cell.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so a column cannot end up with mixed types.
# NOTE(review): the truncated warning saved with this cell looks like a pandas
# DtypeWarning caused by chunked parsing — confirm on a fresh run.
features = pd.read_csv('data/ft_2000_important.csv', low_memory=False)
features.shape

  interactivity=interactivity, compiler=compiler, result=result)


(10307, 2016)

In [42]:
# Only the two identifier columns are needed from the test file, so parse just
# those (usecols) instead of every column; the trailing selection pins the
# column order to ['Id', 'idhogar'] regardless of the order in the file.
submission_columns = ['Id', 'idhogar']
submit_base = pd.read_csv('data/test.csv', usecols=submission_columns)[submission_columns]

In [44]:
# Rows that carry a Target value form the training set; the rest are the test set.
has_target = features['Target'].notnull()
train = features[has_target].copy()
test = features[~has_target].copy()

# Remove the label column from train and the 'idhogar' column from test,
# keeping both as standalone objects.
train_labels = np.array(train.pop('Target'))
test_ids = list(test.pop('idhogar'))

# Restrict both frames to the columns they have in common.
train, test = train.align(test, axis = 1, join = 'inner')

In [46]:
# Every column stored as `object` in train is cast to float32 in both frames.
object_columns = [name for name in train if train[name].dtype == 'object']
for name in object_columns:
    train[name] = train[name].astype(np.float32)
    test[name] = test[name].astype(np.float32)

In [48]:
# Sanity check: list any columns that are still of object dtype in either frame.
train_object_columns = train.columns[np.where(train.dtypes == 'object')]
test_object_columns = test.columns[np.where(test.dtypes == 'object')]

print('Train objects: ', train_object_columns)
print('Test objects: ', test_object_columns)

Train objects:  Index([], dtype='object')
Test objects:  Index([], dtype='object')


In [2]:
import os

from mindfoundry.optaas.client.client import OPTaaSClient

# Prefer a key supplied through the OPTAAS_API_KEY environment variable so the
# notebook does not depend on one machine's directory layout; fall back to the
# original key file when the variable is not set.
api_key = os.environ.get('OPTAAS_API_KEY')
if not api_key:
    with open('C:/Users/willk/OneDrive/Desktop/optaas_key.txt', 'r') as f:
        api_key = f.read()

# A key file usually ends with a newline; strip surrounding whitespace so it is
# not sent as part of the key.
api_key = api_key.strip()

client = OPTaaSClient('https://optaas.mindfoundry.ai', api_key)

In [58]:
from mindfoundry.optaas.client.parameter import Distribution, CategoricalParameter,IntParameter, ChoiceParameter, NumericParameter, FloatParameter
from mindfoundry.optaas.client.constraint import Constraint
from mindfoundry.optaas.client.client import Goal

In [81]:
def macro_f1_score(labels, predictions):
    """Macro-averaged F1 in the (name, value, is_higher_better) metric format.

    Parameters
    ----------
    labels : array-like
        True class labels.
    predictions : numpy.ndarray
        Per-class scores that can be reshaped to
        (number of distinct labels, -1); the class with the highest score in
        each column is taken as the predicted class.

    Returns
    -------
    tuple
        ('macro_f1', score, True), where True marks that higher is better.
    """
    n_classes = len(np.unique(labels))

    # One row per class, one column per sample; pick the best class per column
    class_scores = predictions.reshape(n_classes, -1)
    predicted_classes = class_scores.argmax(axis = 0)

    score = f1_score(labels, predicted_classes, average = 'macro')

    return 'macro_f1', score, True

def objective(num_leaves, learning_rate, boosting_type,
                      subsample, subsample_for_bin, min_child_samples,
                      reg_alpha, reg_lambda, colsample_bytree, nfolds=5,
                      random_state=42):
    """Return the cross-validated macro F1 of LightGBM for one hyperparameter set.

    The model is trained on the module-level `train` / `train_labels` data with
    stratified k-fold cross validation and early stopping, and the score
    recorded for each fold is the validation 'macro_f1' entry of
    `model.best_score_`.

    Parameters
    ----------
    num_leaves, learning_rate, boosting_type, subsample, subsample_for_bin,
    min_child_samples, reg_alpha, reg_lambda, colsample_bytree
        Passed unchanged to `lgb.LGBMClassifier` under the same names.
    nfolds : int, default 5
        Number of stratified folds.
    random_state : int or None, default 42
        Seed for the fold shuffling. A fixed seed means every hyperparameter
        configuration is scored on the same folds, so scores are comparable
        between calls and reproducible. Pass None for unseeded shuffling.

    Returns
    -------
    numpy.float64
        Mean of the per-fold validation macro F1 scores.
    """

    # Stratified folds, shuffled with a fixed seed (see `random_state` above)
    strkfold = StratifiedKFold(n_splits = nfolds, shuffle = True,
                               random_state = random_state)

    # Convert to arrays for positional indexing with the fold indices
    feature_matrix = np.array(train)
    labels = np.array(train_labels).reshape((-1, ))

    # n_estimators is an upper bound; early stopping decides the actual number.
    # NOTE(review): confirm early stopping behaves as intended when
    # boosting_type is 'dart'.
    model = lgb.LGBMClassifier(num_leaves=num_leaves, learning_rate=learning_rate,
                               boosting_type=boosting_type, subsample=subsample,
                               subsample_for_bin=subsample_for_bin,
                               min_child_samples=min_child_samples,
                               reg_alpha=reg_alpha, reg_lambda=reg_lambda,
                               colsample_bytree=colsample_bytree,
                               class_weight = 'balanced',
                               n_jobs=-1, n_estimators=10000)

    valid_scores = []

    # Iterate through the folds
    for train_indices, valid_indices in strkfold.split(feature_matrix, labels):

        # Training and validation data
        X_train = feature_matrix[train_indices]
        X_valid = feature_matrix[valid_indices]
        y_train = labels[train_indices]
        y_valid = labels[valid_indices]

        # Train with early stopping
        model.fit(X_train, y_train, early_stopping_rounds = 100,
                  eval_metric = macro_f1_score,
                  eval_set = [(X_train, y_train), (X_valid, y_valid)],
                  eval_names = ['train', 'valid'],
                  verbose = 400)

        # Record the validation fold score
        valid_scores.append(model.best_score_['valid']['macro_f1'])

    return np.array(valid_scores).mean()

In [82]:
# Search space handed to OPTaaS: one parameter object per LightGBM hyperparameter.
# Each parameter's name and id match the keyword argument of `objective`.

# Boosting algorithm (categorical choice)
boosting_type = CategoricalParameter(
    'boosting_type',
    values = ['gbdt', 'dart', 'goss'],
    id='boosting_type',
)

# Tree size
num_leaves = IntParameter(
    'num_leaves',
    minimum=3,
    maximum=50,
    id='num_leaves',
)

# Learning rate, sampled on a log-uniform scale
learning_rate = FloatParameter(
    'learning_rate',
    minimum=0.025,
    maximum=0.25,
    id='learning_rate',
    distribution=Distribution.LOGUNIFORM,
)

# Row subsampling fraction
subsample = FloatParameter(
    'subsample',
    minimum=0.5,
    maximum=1.0,
    id='subsample',
)

subsample_for_bin = IntParameter(
    'subsample_for_bin',
    minimum=2000,
    maximum=100000,
    id='subsample_for_bin',
)

min_child_samples = IntParameter(
    'min_child_samples',
    minimum=5,
    maximum=80,
    id='min_child_samples',
)

# Regularization strengths
reg_alpha = FloatParameter(
    'reg_alpha',
    minimum=0.0,
    maximum=1.0,
    id='reg_alpha',
)

reg_lambda = FloatParameter(
    'reg_lambda',
    minimum=0.0,
    maximum=1.0,
    id='reg_lambda',
)

# Column subsampling fraction per tree
colsample_bytree = FloatParameter(
    'colsample_bytree',
    minimum=0.5,
    maximum=1.0,
    id='colsample_bytree',
)


In [83]:
# Whenever boosting_type is 'goss', subsample must be exactly 1.
# NOTE(review): presumably because LightGBM's GOSS mode cannot be combined with
# row subsampling — confirm against the LightGBM documentation.
subsample_constraint = Constraint(
    when=(boosting_type == 'goss'),
    then=(subsample == 1),
)

In [84]:
# Register the optimization task with OPTaaS: maximize the objective over the
# nine LightGBM parameters, subject to the goss/subsample constraint.
# `subsample_constraint` is the Constraint object defined in the previous cell;
# reusing it keeps a single definition of the rule instead of rebuilding an
# identical Constraint inline.
task = client.create_task(
    title = 'Light GBM Opt',
    goal = Goal.max,
    parameters = [num_leaves, learning_rate, boosting_type,
                  subsample, subsample_for_bin, min_child_samples,
                  reg_alpha, reg_lambda, colsample_bytree],
    constraints = [subsample_constraint],
    # The run stops early once this score is reached
    target_score = 0.45
)

In [None]:
# Let OPTaaS drive the search: it proposes configurations, scores each one with
# `objective`, and runs for at most 10 iterations.
best_result, best_configuration = task.run(
    objective,
    max_iterations = 10,
)

Running task "Light GBM Opt" for 10 iterations
(or until target score 0.45 is reached)

Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[24]	train's multi_logloss: 0.518707	train's macro_f1: 0.749626	valid's multi_logloss: 0.817886	valid's macro_f1: 0.407854
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[22]	train's multi_logloss: 0.541917	train's macro_f1: 0.734547	valid's multi_logloss: 0.867512	valid's macro_f1: 0.334719
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[25]	train's multi_logloss: 0.50887	train's macro_f1: 0.76704	valid's multi_logloss: 0.833868	valid's macro_f1: 0.382666
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[26]	train's multi_logloss: 0.490726	train's macro_f1: 0.781751	valid's multi_logloss: 0.838261	valid's macro_f1: 0.381592
Training until validation scor

Iteration: 5    Score: 0.3832819497590046
Configuration: {'num_leaves': 12, 'learning_rate': 0.18768075950567517, 'boosting_type': 'gbdt', 'subsample': 0.9049309842188232, 'subsample_for_bin': 56892, 'min_child_samples': 18, 'reg_alpha': 0.92637735625629, 'reg_lambda': 0.05869914257710396, 'colsample_bytree': 0.7798977510580736}

Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[84]	train's multi_logloss: 0.507803	train's macro_f1: 0.755871	valid's multi_logloss: 0.809192	valid's macro_f1: 0.421317
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[145]	train's multi_logloss: 0.432586	train's macro_f1: 0.845112	valid's multi_logloss: 0.82231	valid's macro_f1: 0.415182
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[163]	train's multi_logloss: 0.433784	train's macro_f1: 0.857511	valid's multi_logloss: 0.830553	valid's macro_f1: 0.382918
Trai

In [56]:
# Request 10 configurations from the task; as the last expression in the cell,
# the returned list is displayed as the cell output.
task.generate_configurations(10)

[{ 'id': '68094917-5202-451d-8c43-45915cf0cac1',
   'type': 'exploration',
   'values': { 'boosting_type': 'gbdt',
               'colsample_bytree': 0.75,
               'learning_rate': 0.1375,
               'min_child_samples': 42,
               'num_leaves': 26,
               'reg_alpha': 0.5,
               'reg_lambda': 0.5,
               'subsample_for_bin': 51000}},
 { 'id': 'd8178825-25b5-4b97-9b77-1333c3f01314',
   'type': 'exploration',
   'values': { 'boosting_type': 'gbdt',
               'colsample_bytree': 0.75,
               'learning_rate': 0.1375,
               'min_child_samples': 42,
               'num_leaves': 26,
               'reg_alpha': 0.5,
               'reg_lambda': 0.5,
               'subsample': 1.0,
               'subsample_for_bin': 51000}},
 { 'id': '813aad41-980c-49e6-9f4f-ad21cc5855ec',
   'type': 'exploration',
   'values': { 'boosting_type': 'gbdt',
               'colsample_bytree': 0.5384484771722862,
               'learning_rate': 0.0