<a href="https://colab.research.google.com/github/WideSu/Python-for-DS/blob/main/code/HyperOpt_HyperParam_Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

TO-DO
- [x] Test the average time usage and RMSE for each epoch using scikit-learn random search
- [ ] Test TPE hyper param tuning for HyperOpt, Ray, Optuna
- [ ] Plot the RMSE through timeline
- [ ] Use the different samplers in Optuna: Random, TPE, CMA-ES, NSGA-II

The outcome:
- A table comparing the average RMSE and execution time of all hyperparameter tuning methods

|HPO Package                                  |Avg RMSE                        |Avg Time Elapsed                                             |
|---------------------------------------------|--------------------------------|-------------------------------------------------------------|
|Scikit-learn                                 |                                |                                                             |
|HyperOpt                                     |                                |                                                             |
|Ray                                          |                                |                                                             |
|Optuna                                       |                                |                                                             |

|Library|Pros|Cons|Scenario|
|-|-|-|-|
|Scikit-learn|Flexible and basic|Only 2 basic methods (grid/random); newer methods are not yet stable|Traditional tuning|
|HyperOpt|High-speed and flexible,New search method: TPE/ATPE| Out-of-date interface |Time-limited|
|Ray|Systematic and well wrapped|Too customized and not flexible,Time-cost on initialization|Fast development and deployment with various tuning methods|
|Optuna|Performs well and is lightweight; includes all popular and stable tuning methods|Not all methods are well wrapped|When accuracy and flexibility are required|


In [None]:
# @title Mount Google Drive
# Colab-only: mount the user's Google Drive, then make the project folder the working
# directory so the relative paths used later ('./exp_data.csv', the result pickle)
# resolve inside it.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/HPO/

Mounted at /content/drive
/content/drive/MyDrive/HPO


In [None]:
# @title Install and import packages
! pip install dateutil
! pip install lightgbm
! pip install optuna
import pandas as pd
import dateutil
import datetime
import optuna
from tqdm import tqdm, trange
from lightgbm import LGBMRegressor
import sklearn
import math
import time

In [None]:
# @title Read-in data and check data type and volume
# Load the experiment dataset from the current working directory.
df = pd.read_csv('./exp_data.csv')
# Print the per-column dtype / non-null summary of the loaded frame.
df.info()

In [None]:
# @title Convert the date column to datetime type
# Parse `date` in place so later cells can compare it against datetime bounds.
df["date"] = pd.to_datetime(df["date"])

In [None]:
import pickle
import time
import hyperopt
from hyperopt import fmin, hp, Trials

# ---------------------------------------------------------------------------
# Experiment configuration
# ---------------------------------------------------------------------------
n_trials = 15        # evaluations per tuning run (passed to fmin as max_evals)
predict_times = 60   # number of rolling windows the backtest loop iterates over

train_timespan_months = 180   # length of each training window
whole_period_months = 60      # NOTE(review): not referenced in the visible cells
test_timespan_months = 1      # length of each test window
first_end_time = datetime.datetime(2015, 12, 1)   # end of the first training window

# Rolling cursor: end date of the current training window.
train_end_date = first_end_time

# Target column and feature columns read from `df`.
y_col = 'ret'
feat_cols = """
    absacc acc age agr baspread bm bm_ia
    cash cashdebt cashpr cfp cfp_ia chatoia chcsho chempia chinv chmom
    chpmia chtx cinvest convind currat depr divi divo dolvol dy
    egr ep gma grcapx grltnoa herf hire ill indmom invest lev lgr
    maxret mom12m mom1m mom36m mom6m ms mve_ia mvel1 nincr operprof
    orgcap pchcapx_ia pchcurrat pchdepr pchgm_pchsale pchquick pchsale_pchinvt
    pchsale_pchrect pchsale_pchxsga pchsaleinv pctacc ps quick rd rd_mve
    rd_sale realestate retvol roaq roavol roeq roic rsup salecash pricedelay
    saleinv salerec secured securedind sgr sin sp std_dolvol std_turn
    stdacc stdcf tang tb turn zerotrade aeavol ear beta betasq idiovol
""".split()

# ---------------------------------------------------------------------------
# HyperOpt search space
# ---------------------------------------------------------------------------
# The three quniform dimensions use step 1; the objective casts them to int before
# handing them to the model.
# NOTE(review): LightGBM documents that bagging_fraction only takes effect when
# bagging_freq > 0, and no bagging_freq is set in the visible code -- confirm whether
# this dimension actually influences the model.
space = {
    'n_estimators': hp.quniform('n_estimators', 50, 500, 1),
    'num_leaves': hp.quniform('num_leaves', 10, 512, 1),
    'min_data_in_leaf': hp.quniform('min_data_in_leaf', 10, 80, 1),
    # subsample
    'bagging_fraction': hp.uniform('bagging_fraction', 0.0, 1.0),
    # eta
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.1),
    # reg_alpha
    'lambda_l1': hp.uniform('lambda_l1', 0.01, 1),
    # reg_lambda
    'lambda_l2': hp.uniform('lambda_l2', 0.01, 1),
}

# ---------------------------------------------------------------------------
# Result accumulators (column name -> list of values, one entry per window)
# ---------------------------------------------------------------------------
# Per-window results of the tuning run currently being executed.
evaluate_detail_df = {
    column: []
    for column in (
        'Train Start Date',
        'Train End Date',
        'Test Start Date',
        'Test End Date',
        'Smallest RMSE',
        'Time Ellipsed',
    )
}

# Same columns plus the library name; later cells append each library's rows here.
library_evaluation_df = {
    column: []
    for column in (
        'Library',
        'Train Start Date',
        'Train End Date',
        'Test Start Date',
        'Test End Date',
        'Smallest RMSE',
        'Time Ellipsed',
    )
}

# HyperOpt

In [None]:
# Run the backtest for 5 years
#
# Rolling-window benchmark of HyperOpt random search: for each of `predict_times`
# monthly steps, fit LightGBM on the trailing `train_timespan_months` of data, score
# it on the following `test_timespan_months`, and record the best RMSE found in
# `n_trials` evaluations together with the wall-clock time of the search.
#
# The cell is written to be re-runnable: the window cursor and the result records are
# re-initialised here instead of relying on (and mutating) state left by earlier cells.
#
# NOTE(review): the objective is scored on the test window, so the hyperparameters are
# selected on the same data the reported RMSE comes from.
# NOTE(review): the search is unseeded; fmin accepts an `rstate` argument if
# reproducible trials are needed (expected type depends on the hyperopt version).


def _make_rmse_objective(X_train, y_train, X_test, y_test):
    """Build the HyperOpt objective for one train/test window.

    The returned callable takes a sampled parameter dict, fits an LGBMRegressor on
    the training arrays and returns the RMSE of its predictions on the test arrays.
    """
    def objective(params):
        param_dict = dict(
            # quniform samples arrive as floats; LightGBM needs integers here.
            n_estimators=int(params['n_estimators']),
            num_leaves=int(params['num_leaves']),
            min_data_in_leaf=int(params['min_data_in_leaf']),
            bagging_fraction=params['bagging_fraction'],
            learning_rate=params['learning_rate'],
            lambda_l1=params['lambda_l1'],
            lambda_l2=params['lambda_l2'],
        )
        model = LGBMRegressor(seed=42, **param_dict)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        mse = sklearn.metrics.mean_squared_error(y_test, y_pred)
        return math.sqrt(mse)  # loss value minimised by fmin
    return objective


# Loop-invariant offsets.
one_month = dateutil.relativedelta.relativedelta(months=1)
train_span = dateutil.relativedelta.relativedelta(months=train_timespan_months)
test_span = dateutil.relativedelta.relativedelta(months=test_timespan_months)

# Fresh state for this run.
train_end_date = first_end_time
window_records = []

for period_time in trange(predict_times):
    train_start_date = train_end_date - train_span
    test_end_date = train_end_date + test_span

    # Windows are open on the left and closed on the right: (start, end].
    in_train = (df['date'] > train_start_date) & (df['date'] <= train_end_date)
    in_test = (df['date'] > train_end_date) & (df['date'] <= test_end_date)
    train_data = df.loc[in_train]
    test_data = df.loc[in_test]
    if train_data.empty or test_data.empty:
        raise ValueError(
            f'Empty window: train ({train_start_date}, {train_end_date}] has '
            f'{len(train_data)} rows, test ({train_end_date}, {test_end_date}] has '
            f'{len(test_data)} rows'
        )

    X_train = train_data[feat_cols].values
    y_train = train_data[y_col].values
    X_test = test_data[feat_cols].values
    y_test = test_data[y_col].values.ravel()
    objective = _make_rmse_objective(X_train, y_train, X_test, y_test)

    # Time only the search itself; perf_counter is monotonic.
    ts = time.perf_counter()
    trials = Trials()
    best = fmin(objective,
        space=space,
        algo=hyperopt.rand.suggest, # random search
        max_evals=n_trials,
        trials=trials)
    te = time.perf_counter()
    exc_time = te - ts

    best_result = min(trials.losses())
    # Keys (including the 'Time Ellipsed' spelling) match the columns other cells read.
    window_records.append({
        'Train Start Date': train_start_date,
        'Train End Date': train_end_date,
        'Test Start Date': train_end_date + one_month,
        'Test End Date': test_end_date,
        'Smallest RMSE': best_result,
        'Time Ellipsed': exc_time,
    })

    # Slide the window forward by one month.
    train_end_date += one_month

evaluate_detail_df = pd.DataFrame(
    window_records,
    columns=[
        'Train Start Date',
        'Train End Date',
        'Test Start Date',
        'Test End Date',
        'Smallest RMSE',
        'Time Ellipsed',
    ],
)

In [None]:
# Merge this run's per-window results into the cross-library accumulator.
# Rows already stored for the same library are removed first, so re-running this cell
# replaces the HyperOpt results instead of appending a duplicate copy.
library_name = 'HyperOpt'

kept_rows = [
    row for row, stored_library in enumerate(library_evaluation_df['Library'])
    if stored_library != library_name
]
for stored_values in library_evaluation_df.values():
    # Slice assignment keeps the same list objects that the accumulator holds.
    stored_values[:] = [stored_values[row] for row in kept_rows]

library_evaluation_df['Library'].extend([library_name] * len(evaluate_detail_df))
for column in library_evaluation_df:
    if column != 'Library':
        library_evaluation_df[column].extend(evaluate_detail_df[column])

# Show a preview of the per-window results (a bare expression is only displayed when
# it is the last statement of the cell).
evaluate_detail_df.head()

In [None]:
# Turn the accumulated column lists into a DataFrame (one row per backtest window).
hyperopt_tune_res = pd.DataFrame(library_evaluation_df)
# Persist the results as a pickle in the current working directory.
hyperopt_tune_res.to_pickle('hyperopt_tune_res.pkl')

In [None]:
# Display the collected results table.
hyperopt_tune_res

Unnamed: 0,Library,Train Start Date,Train End Date,Test Start Date,Test End Date,Smallest RMSE,Time Ellipsed
0,HyperOpt,2000-12-01,2015-12-01,2016-01-01,2016-01-01,0.118212,652.687818
1,HyperOpt,2001-01-01,2016-01-01,2016-02-01,2016-02-01,0.068459,523.28443
2,HyperOpt,2001-02-01,2016-02-01,2016-03-01,2016-03-01,0.090916,558.93683
3,HyperOpt,2001-03-01,2016-03-01,2016-04-01,2016-04-01,0.072256,511.158078
4,HyperOpt,2001-04-01,2016-04-01,2016-05-01,2016-05-01,0.057222,661.07788
5,HyperOpt,2001-05-01,2016-05-01,2016-06-01,2016-06-01,0.067168,510.424067
6,HyperOpt,2001-06-01,2016-06-01,2016-07-01,2016-07-01,0.06721,601.716901
7,HyperOpt,2001-07-01,2016-07-01,2016-08-01,2016-08-01,0.064283,639.067829
8,HyperOpt,2001-08-01,2016-08-01,2016-09-01,2016-09-01,0.051814,677.336538
9,HyperOpt,2001-09-01,2016-09-01,2016-10-01,2016-10-01,0.077283,537.86938
