In [1]:
import optuna
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import StratifiedKFold, train_test_split, KFold
from xgboost.sklearn import XGBRegressor
import pandas as pd
# from lightgbm import LGBMRegressor
import lightgbm as lgb
from sklearn.preprocessing import MinMaxScaler

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_X_Y(csvfile):
    df = pd.read_csv(csvfile)
    X = df.drop(['SMILES', 'Blood', 'Brain', 'Ratio'], axis=1)
    X = MinMaxScaler().fit_transform(X)
    # print(len(X))
    blood_y = df['Blood'].ravel()
    brain_y = df['Brain'].ravel()
    ratio_y = df['Ratio'].ravel()
    SMILES = df['SMILES']
    return pd.DataFrame(X), blood_y, brain_y, ratio_y, SMILES

In [3]:
def objective(trial, X, y):
    # XGB Params
    # param_grid = {
    #     "n_estimators": trial.suggest_int("n_estimators", 50, 3000, step=50),
    #     "learning_rate": trial.suggest_categorical('learning_rate',
    #                                                [0.005, 0.008, 0.01, 0.012, 0.014, 0.016, 0.018, 0.02, 0.023, 0.025, 0.028, 0.03]),
    #     "max_depth": trial.suggest_int("max_depth", 0, 30),
    #     'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
    #     'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
    #     "min_child_weight": trial.suggest_int("min_child_weight", 1, 30),
    #     'gamma': trial.suggest_int("gamma", 0, 20, step=1),
    #     'colsample_bytree': trial.suggest_float("colsample_bytree", 0, 1, step=0.1),
    #     'colsample_bylevel': trial.suggest_float("colsample_bylevel", 0, 1, step=0.1),
    #     'colsample_bynode': trial.suggest_float("colsample_bynode", 0, 1, step=0.1),
    # }
    # LGBM params
    param_grid = {
        "boosting_type": trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
        "max_depth": trial.suggest_int("max_depth", 1, 30),
        "learning_rate": trial.suggest_categorical('learning_rate',
                                                   [0.005, 0.008, 0.01, 0.012, 0.014, 0.016, 0.018, 0.02, 0.023, 0.025, 0.028, 0.03]),
        "n_estimators": trial.suggest_int("n_estimators", 50, 3000, step=50),
        "objective": trial.suggest_categorical('objective', ['regression']),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 30),
        # 'colsample_bytree': trial.suggest_float("colsample_bytree", 0, 1, step=0.1),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
        # 'feature_fraction': trial.suggest_categorical('feature_fraction', [0.5])
        # 'verbose': trial.suggest_categorical('verbose', [-1])
    }

    cv = KFold(n_splits=5, shuffle=True)
    
    cv_scores = np.empty(5)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # model = XGBRegressor(**param_grid)
        # model.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=100, verbose=False)
        model = lgb.sklearn.LGBMRegressor(**param_grid)
        # callbacks = [lgb.early_stopping(100, verbose=0), lgb.log_evaluation(period=0)]
        callbacks = [lgb.log_evaluation(period=0)]
        model.fit(X_train, y_train, eval_set=[(X_test, y_test)], callbacks=callbacks)
        preds = model.predict(X_test)
        cv_scores[idx] = r2_score(y_test, preds)

    return np.mean(cv_scores)

In [4]:
filetime = "20221221"
csvfile = f"./result/{filetime}/RatioDescriptors.csv"
n_trials=30

X, blood_y, brain_y, ratio_y, _ = get_X_Y(csvfile)
blood_fea = [162, 222, 254, 255, 261, 300, 320, 325, 338, 369, 396, 441, 446, 474, 481, 489, 502, 514, 529, 530, 541, 549, 565, 568, 570, 582, 594, 598, 602, 631, 632, 638, 645, 646, 648, 802, 807, 832, 986, 1145, 1226, 1232, 1266, 1287, 1289, 1297, 1316, 1356, 1539, 1544]
brain_fea = [3, 40, 150, 164, 243, 246, 254, 255, 261, 310, 342, 368, 369, 449, 450, 458, 497, 506, 529, 542, 549, 578, 602, 604, 610, 618, 637, 642, 644, 646, 770, 781, 801, 814, 846, 986, 999, 1065, 1078, 1136, 1143, 1157, 1278, 1316, 1329, 1330, 1336, 1543, 1545, 1547]
blood_X = X.iloc[:, blood_fea]
brain_X = X.iloc[:, brain_fea]

blood_study = optuna.create_study(direction="maximize", study_name="Blood_Regressor")
func = lambda trial: objective(trial, blood_X, blood_y)
blood_study.optimize(func, n_trials=n_trials)

brain_study = optuna.create_study(direction="maximize", study_name="Brain_Regressor")
func = lambda trial: objective(trial, brain_X, brain_y)
brain_study.optimize(func, n_trials=n_trials)


[32m[I 2022-12-30 16:44:04,402][0m A new study created in memory with name: Blood_Regressor[0m
[32m[I 2022-12-30 16:44:05,682][0m Trial 0 finished with value: 0.26246424547614017 and parameters: {'boosting_type': 'gbdt', 'max_depth': 9, 'learning_rate': 0.028, 'n_estimators': 200, 'objective': 'regression', 'min_child_samples': 25, 'reg_lambda': 0.4610443495472666, 'reg_alpha': 0.9451403456552597}. Best is trial 0 with value: 0.26246424547614017.[0m
[32m[I 2022-12-30 16:45:13,605][0m Trial 1 finished with value: 0.26055649237261125 and parameters: {'boosting_type': 'dart', 'max_depth': 7, 'learning_rate': 0.005, 'n_estimators': 2950, 'objective': 'regression', 'min_child_samples': 7, 'reg_lambda': 4.624125493327024, 'reg_alpha': 2.160826177629117}. Best is trial 0 with value: 0.26246424547614017.[0m
[32m[I 2022-12-30 16:45:15,225][0m Trial 2 finished with value: 0.16085726900611713 and parameters: {'boosting_type': 'gbdt', 'max_depth': 7, 'learning_rate': 0.005, 'n_estimator

In [5]:
print("Blood tuning result:")
print(f"Best value: {blood_study.best_value:.5f}")
print(f"Best params:")

for key, value in blood_study.best_params.items():
    print(f"\t\'{key}\': {value},")

print("Brain tuning result:")
print(f"Best value: {brain_study.best_value:.5f}")
print(f"Best params:")

for key, value in brain_study.best_params.items():
    print(f"\t\'{key}\': {value},")
"""
Best value: 0.60784
Best params:
	'n_estimators': 1300,
	'learning_rate': 0.016,
	'max_depth': 21,
	'lambda': 0.34332326291020665,
	'alpha': 0.8596755985778055,
	'min_child_weight': 20,
	'gamma': 10,
	'colsample_bytree': 0.9,
	'colsample_bylevel': 1.0,
	'colsample_bynode': 0.6,
"""

Blood tuning result:
Best value: 0.50033
Best params:
	'boosting_type': dart,
	'max_depth': 19,
	'learning_rate': 0.016,
	'n_estimators': 2700,
	'objective': regression,
	'min_child_samples': 10,
	'reg_lambda': 0.005293813028263606,
	'reg_alpha': 0.3736524905316094,
Brain tuning result:
Best value: 0.56152
Best params:
	'boosting_type': gbdt,
	'max_depth': 16,
	'learning_rate': 0.025,
	'n_estimators': 2550,
	'objective': regression,
	'min_child_samples': 28,
	'reg_lambda': 0.08538775926146094,
	'reg_alpha': 0.07632249600835359,


"\nBest value: 0.60784\nBest params:\n\t'n_estimators': 1300,\n\t'learning_rate': 0.016,\n\t'max_depth': 21,\n\t'lambda': 0.34332326291020665,\n\t'alpha': 0.8596755985778055,\n\t'min_child_weight': 20,\n\t'gamma': 10,\n\t'colsample_bytree': 0.9,\n\t'colsample_bylevel': 1.0,\n\t'colsample_bynode': 0.6,\n"