<a href="https://colab.research.google.com/github/WideSu/Python-for-DS/blob/main/Optuna_HyperParam_Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

TO-DO
- [x] Test the average time usage and RMSE for each epoch using scikit-learn random search
- [ ] Test TPE hyper param tuning for HyperOpt, Ray, Optuna
- [ ] Plot the RMSE over time
- [ ] Try the different samplers in Optuna: Random, TPE, CMA-ES, NSGA-II

The outcome:
- A table comparing the average RMSE and execution time of all hyperparameter-tuning methods

|HPO Package                                  |Avg RMSE                        |Avg Time Elapsed                                             |
|---------------------------------------------|--------------------------------|-------------------------------------------------------------|
|Scikit-learn                                 |                                |                                                             |
|HyperOpt                                     |                                |                                                             |
|Ray                                          |                                |                                                             |
|Optuna                                       |                                |                                                             |


In [None]:
# @title Mount Google Drive
# Colab-only step: attach the user's Google Drive under /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Switch the working directory to the HPO project folder on the mounted drive.
%cd /content/drive/MyDrive/HPO/

Mounted at /content/drive
/content/drive/MyDrive/HPO


In [None]:
# @title Install and import packages
! pip install dateutil
! pip install lightgbm
! pip install optuna
import datetime
import math
import time

import dateutil
import dateutil.relativedelta
import optuna
import pandas as pd
import sklearn
import sklearn.metrics
from lightgbm import LGBMRegressor
from tqdm import tqdm, trange

In [None]:
# @title Read-in data and check data type and volume
# Load the experiment data set from the current working directory.
df = pd.read_csv(filepath_or_buffer='./exp_data.csv')
# Print the column dtypes, non-null counts and memory usage of the loaded frame.
df.info()

In [None]:
# @title Change into datetime type
# Parse the `date` column into pandas datetimes (replaces the column in place on `df`).
df["date"] = pd.to_datetime(df["date"])

In [None]:
# @title Configuration for experiments
# Accumulator for the cross-library comparison: one list per output column.
library_evaluation_df = {
    column: []
    for column in (
        'Library',
        'Train Start Date',
        'Train End Date',
        'Test Start Date',
        'Test End Date',
        'Smallest RMSE',
        'Time Ellipsed',
    )
}

n_trials = 15 # @param run 15 times of hyper-param tuning

# Window lengths, all expressed in months.
train_timespan_months = 180  # 15 years
whole_period_months = 60  # 5 years
test_timespan_months = 1

# End date of the first training window.
first_end_time = datetime.datetime(year=2015, month=12, day=1)

# Feature columns fed to the model, listed in the order they are selected.
feat_cols = [
    'absacc', 'acc', 'age', 'agr', 'baspread', 'bm', 'bm_ia',
    'cash', 'cashdebt', 'cashpr', 'cfp', 'cfp_ia', 'chatoia', 'chcsho',
    'chempia', 'chinv', 'chmom',
    'chpmia', 'chtx', 'cinvest', 'convind', 'currat', 'depr', 'divi',
    'divo', 'dolvol', 'dy',
    'egr', 'ep', 'gma', 'grcapx', 'grltnoa', 'herf', 'hire', 'ill',
    'indmom', 'invest', 'lev', 'lgr',
    'maxret', 'mom12m', 'mom1m', 'mom36m', 'mom6m', 'ms', 'mve_ia',
    'mvel1', 'nincr', 'operprof',
    'orgcap', 'pchcapx_ia', 'pchcurrat', 'pchdepr', 'pchgm_pchsale',
    'pchquick', 'pchsale_pchinvt',
    'pchsale_pchrect', 'pchsale_pchxsga', 'pchsaleinv', 'pctacc', 'ps',
    'quick', 'rd', 'rd_mve',
    'rd_sale', 'realestate', 'retvol', 'roaq', 'roavol', 'roeq', 'roic',
    'rsup', 'salecash', 'pricedelay',
    'saleinv', 'salerec', 'secured', 'securedind', 'sgr', 'sin', 'sp',
    'std_dolvol', 'std_turn',
    'stdacc', 'stdcf', 'tang', 'tb', 'turn', 'zerotrade', 'aeavol', 'ear',
    'beta', 'betasq', 'idiovol',
]
# Target column.
y_col = 'ret'

# Moving end of the training window; starts at the first window.
train_end_date = first_end_time
time_usage, score_list, timeline = [], [], []

# Evaluation details for each train and test timespan
evaluate_detail_df = {
    column: []
    for column in (
        'Train Start Date',
        'Train End Date',
        'Test Start Date',
        'Test End Date',
        'Smallest RMSE',
        'Time Ellipsed',
    )
}

In [None]:
# @title Back-test for 5 years
# Walk-forward back-test. For every monthly step the training window is the
# `train_timespan_months` ending at `train_end_date`, the test window is the
# following `test_timespan_months`, and Optuna's TPE sampler runs `n_trials`
# LightGBM fits. The best RMSE and the wall-clock tuning time of each window
# are collected into `evaluate_detail_df`.
SEED = 42  # shared by the LightGBM model and the TPE sampler so runs are repeatable
RESULT_COLUMNS = [
    'Train Start Date',
    'Train End Date',
    'Test Start Date',
    'Test End Date',
    'Smallest RMSE',
    'Time Ellipsed',
]


def make_objective(X_train, y_train, X_test, y_test):
    """Build an Optuna objective for one train/test window.

    Args:
        X_train: feature matrix the model is fitted on.
        y_train: targets matching ``X_train``.
        X_test: feature matrix the fitted model is scored on.
        y_test: targets matching ``X_test``.

    Returns:
        A callable taking an Optuna trial and returning the RMSE of a
        LightGBM regressor fitted with the hyper-parameters suggested by
        that trial.
    """
    def objective(trial):
        param = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 500),
            'num_leaves': trial.suggest_int('num_leaves', 10, 512),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 80),
            # NOTE(review): LightGBM appears to apply bagging_fraction only when
            # bagging_freq > 0, which is not set here, so this dimension may be
            # inert - confirm against the LightGBM parameter docs.
            'bagging_fraction': trial.suggest_float('bagging_fraction', 0.0, 1.0),  # subsample
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),  # eta
            'lambda_l1': trial.suggest_float('lambda_l1', 0.01, 1),  # reg_alpha
            'lambda_l2': trial.suggest_float('lambda_l2', 0.01, 1),  # reg_lambda
        }
        model = LGBMRegressor(seed=SEED, **param)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        mse = sklearn.metrics.mean_squared_error(y_test, y_pred)
        return math.sqrt(mse)  # objective value linked with the Trial object
    return objective


predict_times = whole_period_months  # one 1-month step per back-test month
one_month = dateutil.relativedelta.relativedelta(months=1)
train_span = dateutil.relativedelta.relativedelta(months=train_timespan_months)
test_span = dateutil.relativedelta.relativedelta(months=test_timespan_months)

# Restart from the first window and collect into a fresh list so that
# re-running this cell reproduces the same table instead of continuing from
# wherever the previous run stopped.
train_end_date = first_end_time
window_records = []
for period_time in trange(predict_times):
    train_start_date = train_end_date - train_span
    test_end_date = train_end_date + test_span
    print(train_start_date, train_end_date, test_end_date)
    # Half-open windows (start, end]: the cutoff date itself belongs to training.
    train_data = df.query('@train_start_date < date <= @train_end_date')
    test_data = df.query('@train_end_date < date <= @test_end_date')
    X_train = train_data[feat_cols].values
    y_train = train_data[y_col].values
    X_test = test_data[feat_cols].values
    y_test = test_data[y_col].values.ravel()

    # NOTE(review): the objective is scored on the test window, so the
    # 'Smallest RMSE' is selected on the same data it is reported on.
    study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=SEED))
    objective = make_objective(X_train, y_train, X_test, y_test)
    ts = time.perf_counter()  # monotonic clock, unaffected by system clock changes
    study.optimize(objective, n_trials=n_trials)
    exc_time = time.perf_counter() - ts

    window_records.append({
        'Train Start Date': train_start_date,
        'Train End Date': train_end_date,
        # One month after the training cutoff; presumably the first monthly
        # observation of the test window - TODO confirm the data is monthly.
        'Test Start Date': train_end_date + one_month,
        'Test End Date': test_end_date,
        'Smallest RMSE': study.best_value,
        'Time Ellipsed': exc_time,
    })
    train_end_date += one_month
evaluate_detail_df = pd.DataFrame(window_records, columns=RESULT_COLUMNS)

In [None]:
# Display the per-window back-test results (best RMSE and tuning time for each train/test split).
evaluate_detail_df

Unnamed: 0,Train Start Date,Train End Date,Test Start Date,Test End Date,Smallest RMSE,Time Ellipsed
0,2000-12-01,2015-12-01,2016-01-01,2016-01-01,0.11888,800.256522
1,2001-01-01,2016-01-01,2016-02-01,2016-02-01,0.067029,499.085326
2,2001-02-01,2016-02-01,2016-03-01,2016-03-01,0.091569,536.5597
3,2001-03-01,2016-03-01,2016-04-01,2016-04-01,0.072515,502.082866
4,2001-04-01,2016-04-01,2016-05-01,2016-05-01,0.055176,806.044957
5,2001-05-01,2016-05-01,2016-06-01,2016-06-01,0.066578,724.420032
6,2001-06-01,2016-06-01,2016-07-01,2016-07-01,0.065813,418.449212
7,2001-07-01,2016-07-01,2016-08-01,2016-08-01,0.063775,621.808052
8,2001-08-01,2016-08-01,2016-09-01,2016-09-01,0.051464,479.586529
9,2001-09-01,2016-09-01,2016-10-01,2016-10-01,0.07658,563.58672


In [None]:
# Append the Optuna back-test rows to the cross-library accumulator:
# tag every row with the library name, then copy each shared column across.
library_evaluation_df['Library'].extend(['Optuna'] * len(evaluate_detail_df))
for column in (
    'Train Start Date',
    'Train End Date',
    'Test Start Date',
    'Test End Date',
    'Smallest RMSE',
    'Time Ellipsed',
):
    library_evaluation_df[column].extend(evaluate_detail_df[column])

In [None]:
# Render the accumulated per-library results as a table.
pd.DataFrame(library_evaluation_df)

Unnamed: 0,Library,Train Start Date,Train End Date,Test Start Date,Test End Date,Smallest RMSE,Time Ellipsed
0,Optuna,2000-12-01,2015-12-01,2016-01-01,2016-01-01,0.11888,800.256522
1,Optuna,2001-01-01,2016-01-01,2016-02-01,2016-02-01,0.067029,499.085326
2,Optuna,2001-02-01,2016-02-01,2016-03-01,2016-03-01,0.091569,536.5597
3,Optuna,2001-03-01,2016-03-01,2016-04-01,2016-04-01,0.072515,502.082866
4,Optuna,2001-04-01,2016-04-01,2016-05-01,2016-05-01,0.055176,806.044957
5,Optuna,2001-05-01,2016-05-01,2016-06-01,2016-06-01,0.066578,724.420032
6,Optuna,2001-06-01,2016-06-01,2016-07-01,2016-07-01,0.065813,418.449212
7,Optuna,2001-07-01,2016-07-01,2016-08-01,2016-08-01,0.063775,621.808052
8,Optuna,2001-08-01,2016-08-01,2016-09-01,2016-09-01,0.051464,479.586529
9,Optuna,2001-09-01,2016-09-01,2016-10-01,2016-10-01,0.07658,563.58672
