## Importing the Packages & Data

In [79]:
#!pip install optuna

In [80]:
import pandas as pd
import numpy as np
import matplotlib
import plotly

# Models to use
import lightgbm as lgb
import catboost as cb

# Importing the metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
#from sklearn.metrics import confusion_matrix
#from sklearn.metrics import plot_confusion_matrix

# For measuring the training time taken during the fit process
import time

#from hyperopt import hp
#from hyperopt import fmin, tpe, STATUS_OK, STATUS_FAIL, Trials, space_eval

# Importing the Scalers
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import optuna

In [81]:
# Read the cleaned HIGGS table from disk and preview its first rows.
df = pd.read_csv(filepath_or_buffer='../input/higgs-cleaned/higgs_cleaned.csv')
df.head(n=5)

In [82]:
# Separate the target column from the feature columns, then hold out 20% of
# the rows as a test set (fixed seed so the split is repeatable).
y = df['class']
X = df.drop(columns='class')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1864,
)

In [83]:
# df_train, df_test = train_test_split(df, test_size=0.2, random_state=1864)

## Normalizing the Features into range [0-1]

Scaling is an important part of the pipeline, and I will use `MinMaxScaler` for it. One could instead standardize the features to zero mean and unit variance, but the features do not always have a Gaussian shape; I ran basic tests to check this in the baselines notebook.

In [84]:
# Only the features are rescaled: the labels already lie in the [0-1] range
# that MinMaxScaler produces.
# The scaler learns its min/max from the training split alone and the same
# mapping is then applied to both splits.
scaler = MinMaxScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [85]:
# Wrap the scaled training features and their labels in LightGBM's Dataset container.
d_train = lgb.Dataset(data=X_train, label=y_train)

The search space below is defined with Optuna's `trial.suggest_*` methods. For the meaning of the analogous Hyperopt search functions such as `hp.uniform`, see http://hyperopt.github.io/hyperopt/getting-started/search_spaces/.

## Defining the Search Space & Objective Function

In [86]:
def objective(trial):
    """Optuna objective: mean 5-fold CV AUC of a DART LightGBM model on ``d_train``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters and to report intermediate
        scores to the pruning callback.

    Returns
    -------
    float
        The last entry of the ``'auc-mean'`` series returned by ``lgb.cv``.
    """
    classifier_parameters = {
        # --- tuned hyperparameters ---
        'learning_rate':     trial.suggest_float('learning_rate', 0.2, 0.3, step=0.005),
        'max_depth':         trial.suggest_int('max_depth', 6, 10, step=1),
        'min_child_weight':  trial.suggest_int('min_child_weight', 1, 8, step=1),
        'colsample_bytree':  trial.suggest_float('colsample_bytree', 0.8, 1, step=0.05),
        # `suggest_uniform` is deprecated; `suggest_float` without `step`
        # samples the same continuous range.
        # NOTE(review): LightGBM documents that bagging (`subsample`) only takes
        # effect when `subsample_freq`/`bagging_freq` is non-zero; it is not set
        # here -- confirm whether bagging is actually intended.
        'subsample':         trial.suggest_float('subsample', 0.7, 1),
        # The Optuna name must match the LightGBM key exactly (no trailing
        # space), otherwise `study.best_params` carries a key that is not a
        # valid parameter name when it is reused downstream.
        'num_iterations':    trial.suggest_categorical('num_iterations', [150, 200, 350, 500]),
        'min_child_samples': trial.suggest_int('min_child_samples', 100, 300, step=25),
        'num_leaves':        trial.suggest_int('num_leaves', 20, 50, step=5),
        # --- fixed settings (not part of `study.best_params`) ---
        'objective':         'binary',
        'metric':            'auc',
        'boosting_type':     'dart',
        'feature_pre_filter': False,
        'random_seed':       1864,
    }

    # Reports the per-iteration CV AUC to Optuna so unpromising trials can be pruned.
    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "auc")

    # NOTE(review): LightGBM warns that early stopping is not available in dart
    # mode, so `early_stopping_rounds` is probably a no-op here -- confirm.
    cv_results = lgb.cv(
        classifier_parameters,
        d_train,
        nfold=5,
        verbose_eval=False,
        early_stopping_rounds=90,
        callbacks=[pruning_callback],
    )

    # Score of the final boosting round, averaged over the folds.
    best_auc = cv_results['auc-mean'][-1]
    return best_auc

## Hyperparameter Tuning

We can now start the trials. Optuna is much easier to use than Hyperopt because the study object already logs the important information about each trial.

In [87]:
# Seed the sampler so the sequence of suggested hyperparameters is reproducible
# across runs, using the same seed as the data split and the model.
sampler = optuna.samplers.TPESampler(seed=1864)

study = optuna.create_study(
    direction='maximize',  # the objective returns AUC, so higher is better
    study_name='Optuna_Basic_Study',
    sampler=sampler,
)
study.optimize(objective, n_trials=100, show_progress_bar=True)

In [88]:
# Best objective value found by the study, rounded to 8 decimals for display.
best_cv_score = round(study.best_value, 8)
print('Best 5-Fold CV Score on train set:', best_cv_score)

In [89]:
# List every hyperparameter of the best trial, one "name : value" pair per line.
print('Best Parameters')
print('-' * 50)
for param_name, param_value in study.best_params.items():
    print(param_name, ':', param_value)

In [90]:
# Position of the best trial within the study.
best_trial = study.best_trial
print('Best Trial Index:', best_trial.number)

## Plots of the Initial Hyperparameter Search

In [98]:
# Switch plotly to notebook mode before the Optuna figures below are drawn.
from plotly import offline as pyo

pyo.init_notebook_mode()

In [99]:
# Hyperparameter importances for the study.
optuna.visualization.plot_param_importances(study=study)

In [100]:
# Same importance plot, but with each trial's duration (in seconds) as the target.
def _trial_seconds(t):
    """Return the duration of trial ``t`` in seconds."""
    return t.duration.total_seconds()


optuna.visualization.plot_param_importances(
    study,
    target=_trial_seconds,
    target_name="duration",
)

In [101]:
# Slice plot of the study.
optuna.visualization.plot_slice(study=study)

In [102]:
# Optimization history of the study.
optuna.visualization.plot_optimization_history(study=study)

In [103]:
# Parallel-coordinate plot of the study.
optuna.visualization.plot_parallel_coordinate(study=study)

## Applying the Tuned Model on Test Data

In [104]:
# `study.best_params` contains only the values Optuna *suggested*. The settings
# that were held fixed during tuning (objective, DART boosting, seed) are not in
# it and must be re-applied, otherwise the final model is a different model
# from the one that was tuned.
# Keys are stripped so that a parameter name registered with stray whitespace
# still maps to a real LightGBM parameter.
tuned_params = {name.strip(): value for name, value in study.best_params.items()}

# Pass the number of boosting rounds under the scikit-learn wrapper's own name.
if 'num_iterations' in tuned_params:
    tuned_params['n_estimators'] = tuned_params.pop('num_iterations')

fixed_params = {
    'objective': 'binary',
    'boosting_type': 'dart',
    'random_state': 1864,
}

best_params = {**fixed_params, **tuned_params}

clf = lgb.LGBMClassifier(**best_params)
# No `verbose` keyword: it only affected evaluation logging, there is no
# eval_set here, and newer LightGBM releases no longer accept it in `fit`.
clf.fit(X_train, y_train)

preds = clf.predict(X_test)

# Accuracy on the held-out test split.
print(accuracy_score(y_test, preds))

## Saving the Model into .json/.txt

In [None]:
# Write the fitted classifier's underlying booster to a text file.
booster = clf.booster_
booster.save_model('model_minmax_lgbm_optuna_1.txt')

In [None]:
# NOTE(review): the block below is a bare string literal, so nothing in it is
# executed; it only disables a JSON export. The `format=`/`pool=` arguments look
# like a CatBoost-style `save_model` call rather than LightGBM -- confirm before
# re-enabling it.
"""clf.save_model(
    "model3_minmax_lgbm_dart.json",
    format="json",
    # pool=pool  # this parameter is required only for models with categorical features.
)"""