<a href="https://colab.research.google.com/github/WideSu/Python-for-DS/blob/main/Sklearn_HyperParam_Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

TO-DO
- [x] Test the average time usage and RMSE for each epoch using scikit-learn random search
- [ ] Test TPE hyper param tuning for HyperOpt, Ray, Optuna
- [ ] Plot the RMSE over time
- [ ] Use the different samplers in Optuna: Random, TPE, CMA-ES, NSGA-II

The outcome:
- A table showing the average RMSE and execution time for all hyperparameter tuning methods

|HPO Package                                  |Avg RMSE                        |Avg Time Elapsed                                             |
|---------------------------------------------|--------------------------------|-------------------------------------------------------------|
|Scikit-learn                                 |                                |                                                             |
|HyperOpt                                     |                                |                                                             |
|Ray                                          |                                |                                                             |
|Optuna                                       |                                |                                                             |

|Library|Pros|Cons|Scenario|
|-|-|-|-|
|Scikit-learn|Flexible and basic|Only two basic methods (grid/random); newer methods are not stable|Traditional tuning|
|HyperOpt|Fast and flexible; newer search methods: TPE/ATPE|Outdated interface|Time-limited tuning|
|Ray|Systematic and well wrapped|Heavily customized and not flexible; initialization is time-consuming|Fast development and deployment with various tuning methods|
|Optuna|Performs well and is lightweight; includes all popular and stable tuning methods|Not every method is well wrapped|When accuracy and flexibility are required|


In [None]:
# @title Mount Google Drive
# Colab-only step: attach Google Drive under /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Switch into the project folder so the relative data path used when loading
# the CSV resolves against it.
%cd /content/drive/MyDrive/HPO/

Mounted at /content/drive
/content/drive/MyDrive/HPO


In [None]:
# @title Install and import packages
! pip install dateutil
! pip install lightgbm
! pip install optuna
import pandas as pd
import dateutil
import datetime
import optuna
from tqdm import tqdm, trange
from lightgbm import LGBMRegressor
import sklearn
import math
import time

In [None]:
# @title Read-in data and check data type and volume
# Load the experiment dataset; the path is relative to the current working directory.
df = pd.read_csv('./exp_data.csv')
# Print a summary of the frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# @title Change into datetime type
# Parse the `date` column into pandas datetimes so it can be compared
# against datetime bounds when the rolling windows are sliced.
df["date"] = pd.to_datetime(df["date"])

In [None]:
# @title Configurations for experiments
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer
from scipy.stats import uniform,randint
# Configuration
# Length, in months, of each training window.
train_timespan_months = 180
# NOTE(review): not referenced anywhere else in the visible notebook —
# presumably the total evaluation horizon in months; confirm or remove.
whole_period_months = 60
# Length, in months, of the test window that follows each training window.
test_timespan_months = 1
# Number of parameter settings sampled per hyper-parameter search.
n_trials = 15
# Training cut-off date of the first window.
first_end_time = datetime.datetime(2015, 12, 1)
# Number of train/test windows to evaluate.
predict_times = 60
# Names of the columns used as model features.
feat_cols = ['absacc', 'acc', 'age', 'agr', 'baspread','bm', 'bm_ia',
             'cash', 'cashdebt', 'cashpr', 'cfp', 'cfp_ia', 'chatoia', 'chcsho', 'chempia', 'chinv', 'chmom',
             'chpmia', 'chtx', 'cinvest', 'convind', 'currat', 'depr', 'divi', 'divo', 'dolvol', 'dy', 
             'egr', 'ep', 'gma', 'grcapx', 'grltnoa', 'herf', 'hire', 'ill', 'indmom', 'invest', 'lev', 'lgr',
             'maxret', 'mom12m', 'mom1m', 'mom36m', 'mom6m', 'ms', 'mve_ia', 'mvel1', 'nincr', 'operprof',
             'orgcap', 'pchcapx_ia', 'pchcurrat', 'pchdepr', 'pchgm_pchsale', 'pchquick', 'pchsale_pchinvt',
             'pchsale_pchrect', 'pchsale_pchxsga', 'pchsaleinv', 'pctacc', 'ps', 'quick', 'rd', 'rd_mve',
             'rd_sale', 'realestate', 'retvol', 'roaq', 'roavol', 'roeq', 'roic', 'rsup', 'salecash', 'pricedelay',
             'saleinv', 'salerec', 'secured', 'securedind', 'sgr', 'sin', 'sp', 'std_dolvol', 'std_turn',
             'stdacc', 'stdcf', 'tang', 'tb', 'turn', 'zerotrade','aeavol','ear','beta','betasq','idiovol']
# Name of the target column.
y_col = 'ret'
# Training cut-off of the current window; starts at the first cut-off.
train_end_date = first_end_time

# Evaluation details for each train and test timespan: one list per result
# column. The keys are reused verbatim as column names by later cells
# (including the spelling 'Time Ellipsed'), so they must not be renamed here alone.
evaluate_detail_df = {
    'Train Start Date': [],
    'Train End Date': [],
    'Test Start Date': [],
    'Test End Date': [],
    'Smallest RMSE': [],
    'Time Ellipsed': []
}

# Scikit-learn

Refer to the [scikit-learn RandomizedSearchCV documentation](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)

In [None]:
def rmse_score(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Args:
        y_true: Ground-truth target values (array-like).
        y_pred: Predicted target values, same shape as ``y_true``.

    Returns:
        The RMSE as a float (lower is better).
    """
    # Take the square root explicitly instead of passing `squared=False`:
    # that keyword was deprecated in scikit-learn 1.4 and later removed, so
    # the explicit form works on both old and new releases.
    return math.sqrt(sklearn.metrics.mean_squared_error(y_true, y_pred))
# Imported explicitly: `import dateutil` alone does not guarantee that the
# `dateutil.relativedelta` submodule has been loaded.
from dateutil.relativedelta import relativedelta

# With greater_is_better=False the scorer reports the *negated* RMSE, because
# the search always maximises its score.
my_scorer = make_scorer(rmse_score, greater_is_better=False)

# LightGBM search space. It is the same for every window, so it is built once.
# scipy's uniform(loc, scale) samples from [loc, loc + scale].
param_distribution = dict(
    n_estimators=randint(low=50, high=500),
    num_leaves=randint(low=10, high=512),
    min_data_in_leaf=randint(low=10, high=80),
    # NOTE(review): [0, 0.1] looks narrow for a subsample fraction, and LightGBM
    # only applies bagging when bagging_freq > 0 — confirm the intended range.
    bagging_fraction=uniform(loc=0, scale=0.1),  # subsample
    learning_rate=uniform(loc=0.01, scale=0.09),  # eta
    lambda_l1=uniform(loc=0.01, scale=0.99),  # reg_alpha
    lambda_l2=uniform(loc=0.01, scale=0.99),  # reg_lambda
)

# One record per rolling window. Collected in a local list so that re-running
# this cell starts from scratch instead of depending on leftover kernel state.
window_records = []
for period_time in trange(predict_times):
    # Derive the window from the loop index rather than mutating a global
    # cursor, so every run of the cell evaluates the same windows.
    train_end_date = first_end_time + relativedelta(months=period_time)
    train_start_date = train_end_date - relativedelta(months=train_timespan_months)
    test_end_date = train_end_date + relativedelta(months=test_timespan_months)

    # Training rows satisfy train_start_date < date <= train_end_date.
    in_train_window = (df['date'] > train_start_date) & (df['date'] <= train_end_date)
    train_data = df.loc[in_train_window]
    X_train = train_data[feat_cols].values
    y_train = train_data[y_col].values

    model = LGBMRegressor(seed=42)
    search_cv = RandomizedSearchCV(model,
                                   param_distribution,
                                   scoring=my_scorer,
                                   random_state=0,
                                   n_iter=n_trials)

    # Time only the search itself.
    ts = time.perf_counter()
    search_cv.fit(X_train, y_train)
    exc_time = time.perf_counter() - ts

    window_records.append({
        'Train Start Date': train_start_date,
        'Train End Date': train_end_date,
        # Recorded as one month after the training cut-off.
        'Test Start Date': train_end_date + relativedelta(months=1),
        'Test End Date': test_end_date,
        # best_score_ is the negated RMSE (see my_scorer); flip the sign so
        # the column holds the actual cross-validated RMSE of the best trial.
        'Smallest RMSE': -search_cv.best_score_,
        'Time Ellipsed': exc_time,
    })

evaluate_detail_df = pd.DataFrame(
    window_records,
    columns=['Train Start Date', 'Train End Date', 'Test Start Date',
             'Test End Date', 'Smallest RMSE', 'Time Ellipsed'],
)

  0%|          | 0/60 [00:00<?, ?it/s]

In [None]:
# Result columns copied verbatim from the per-window evaluation table.
result_cols = ['Train Start Date', 'Train End Date', 'Test Start Date',
               'Test End Date', 'Smallest RMSE', 'Time Ellipsed']

# Create the cross-library accumulator here: scikit-learn is the first library
# evaluated, and no earlier cell defines this name. Re-running the cell
# rebuilds it instead of appending duplicate rows.
library_evaluation_df = {'Library': [], **{col: [] for col in result_cols}}

# These rows come from the scikit-learn RandomizedSearchCV run above.
library_evaluation_df['Library'].extend(['Scikit-learn'] * len(evaluate_detail_df))
for col in result_cols:
    library_evaluation_df[col].extend(evaluate_detail_df[col].tolist())

pd.DataFrame(library_evaluation_df)