In [19]:
import optuna
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import StratifiedKFold, train_test_split, KFold
from xgboost.sklearn import XGBRegressor
import pandas as pd
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.neural_network import MLPRegressor as MLP
# from lightgbm import LGBMRegressor
import lightgbm as lgb
from sklearn.preprocessing import MinMaxScaler
import global_config as cfg
from data_preprocess import get_X_Y

In [20]:
model_enum = cfg.model_enum
model_type = model_enum[2]

In [21]:
# def get_X_Y(csvfile):
#     df = pd.read_csv(csvfile)
#     # 去除含有无效值的列
#     df = df[~df.isin([np.nan, np.inf, -np.inf]).any(1)]
#     df = df.dropna(axis=0, how='any')

#     X = df.drop(['SMILES', 'Blood', 'Brain', 'Ratio'], axis=1)
#     X = MinMaxScaler().fit_transform(X)
#     # print(len(X))
#     blood_y = df['Blood'].ravel()
#     brain_y = df['Brain'].ravel()
#     ratio_y = df['Ratio'].ravel()
#     SMILES = df['SMILES']
#     return pd.DataFrame(X).astype('float64'), blood_y, brain_y, ratio_y, SMILES

In [22]:
def objective(trial, X, y):
    # XGB Params
    if model_type == model_enum[0]:
        param_grid = {
            # 'n_estimators': 600,
	        # 'learning_rate': 0.013,
            # 'max_depth': 22,
            # 'lambda': 0.003340201697365462,
            # 'alpha': 0.001044793853811272,
            # 'min_child_weight': 8,
	        # 'gamma': 0,
            # 'colsample_bytree': 1.0,
            # 'colsample_bylevel': 0.3,
            # 'colsample_bynode': 0.6,

            "n_estimators": trial.suggest_int("n_estimators", 50, 3000, step=50),
            "learning_rate": trial.suggest_float('learning_rate', 0.005, 0.03, step=0.001),
            "max_depth": trial.suggest_int("max_depth", 0, 30),
            'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
            'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
            "min_child_weight": trial.suggest_int("min_child_weight", 1, 30),
            'gamma': trial.suggest_int("gamma", 0, 20, step=1),
            'colsample_bytree': trial.suggest_float("colsample_bytree", 0, 1, step=0.1),
            'colsample_bylevel': trial.suggest_float("colsample_bylevel", 0, 1, step=0.1),
            'colsample_bynode': trial.suggest_float("colsample_bynode", 0, 1, step=0.1),
        }
    # LGBM params
    elif model_type == model_enum[1]:
        param_grid = {
            "boosting_type": trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
            "max_depth": trial.suggest_int("max_depth", 1, 30),
            "learning_rate": trial.suggest_categorical('learning_rate',
                                                    [0.005, 0.008, 0.01, 0.012, 0.014, 0.016, 0.018, 0.02, 0.023, 0.025, 0.028, 0.03]),
            "n_estimators": trial.suggest_int("n_estimators", 50, 3000, step=50),
            "objective": trial.suggest_categorical('objective', ['regression']),
            "min_child_samples": trial.suggest_int("min_child_samples", 5, 30),
            # 'colsample_bytree': trial.suggest_float("colsample_bytree", 0, 1, step=0.1),
            'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
            'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
            # 'feature_fraction': trial.suggest_categorical('feature_fraction', [0.5])
            # 'verbose': trial.suggest_categorical('verbose', [-1])
        }
    elif model_type == model_enum[2]:   #SVM
        param_grid = {
            "C": trial.suggest_float('C', 0.1, 10),
            'gamma': trial.suggest_categorical("gamma", ['scale', 'auto']),
            'tol': trial.suggest_categorical("tol", [1e-2, 1e-3, 1e-4]),
            'max_iter': trial.suggest_categorical("max_iter", [1000, 5000, 10000]),
            'epsilon': trial.suggest_float("epsilon", 0.1, 1.0)
        }
    elif model_type == model_enum[3]:   #RF
        param_grid = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 3000, step=100),
            "max_depth": trial.suggest_int("max_depth", 1, 30),
        }
    elif model_type == model_enum[4]:   #MLP
        param_grid = {
            "hidden_layer_sizes": trial.suggest_categorical("hidden_layer_sizes", [(50,), (100,), (150,), (200,)]),
            "activation": trial.suggest_categorical("activation", ['tanh', 'relu']),
            "solver": trial.suggest_categorical("solver", ['lbfgs', 'sgd', 'adam']),
            "early_stopping": True,
            "max_iter": trial.suggest_int("max_iter", 200, 1000, step=100),
        }

    cv = KFold(n_splits=5, shuffle=True)
    
    cv_scores = np.empty(5)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        if model_type == model_enum[0]:
            model = XGBRegressor(**param_grid)
            model.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=100, verbose=False)
        elif model_type == model_enum[1]:
            model = lgb.sklearn.LGBMRegressor(**param_grid)
            # callbacks = [lgb.early_stopping(100, verbose=0), lgb.log_evaluation(period=0)]
            callbacks = [lgb.log_evaluation(period=0)]
            model.fit(X_train, y_train, eval_set=[(X_test, y_test)], callbacks=callbacks)
        elif model_type == model_enum[2]:   #SVM
            model = SVR(**param_grid)
            model.fit(X_train, y_train)
        elif model_type == model_enum[3]:   #RF
            model = RF(**param_grid)
            model.fit(X_train, y_train)
        elif model_type == model_enum[4]:   #MLP
            model = MLP(**param_grid)
            model.fit(X_train, y_train)
        preds = model.predict(X_test)
        cv_scores[idx] = r2_score(y_test, preds)
        # cv_scores[idx] = np.sqrt(mean_squared_error(y_test, preds)) / (y_test.max() - y_test.min())
        # cv_scores[idx] = np.sqrt(mean_squared_error(y_test, preds))

    return np.mean(cv_scores)

In [23]:
csvfile = cfg.padel_csvfilepath
n_trials = 50

X, blood_y, brain_y, ratio_y, _ = get_X_Y(csvfile)
blood_X = X.iloc[:, cfg.blood_fea]
brain_X = X.iloc[:, cfg.brain_fea]
ratio_X = X.iloc[:, cfg.X_fea]
print(model_type)
directions = ["minimize", "maximize"]
direction = directions[1]

  after removing the cwd from sys.path.


(411, 9196)
(411, 3576)
XGB


In [24]:
blood_study = optuna.create_study(direction=direction, study_name="Blood_Regressor")
func = lambda trial: objective(trial, blood_X, blood_y)
blood_study.optimize(func, n_trials=n_trials)

[32m[I 2023-01-11 19:12:54,601][0m A new study created in memory with name: Blood_Regressor[0m
[32m[I 2023-01-11 19:13:01,486][0m Trial 0 finished with value: 0.27630162045114026 and parameters: {'n_estimators': 1950, 'learning_rate': 0.024, 'max_depth': 19, 'lambda': 0.05725171901943744, 'alpha': 3.1686920758281847, 'min_child_weight': 9, 'gamma': 18, 'colsample_bytree': 0.9, 'colsample_bylevel': 0.0, 'colsample_bynode': 1.0}. Best is trial 0 with value: 0.27630162045114026.[0m
[32m[I 2023-01-11 19:13:04,413][0m Trial 1 finished with value: 0.18723647099807766 and parameters: {'n_estimators': 450, 'learning_rate': 0.006, 'max_depth': 9, 'lambda': 0.47617410671793725, 'alpha': 7.296105810765127, 'min_child_weight': 10, 'gamma': 15, 'colsample_bytree': 0.7000000000000001, 'colsample_bylevel': 0.9, 'colsample_bynode': 1.0}. Best is trial 0 with value: 0.27630162045114026.[0m
[32m[I 2023-01-11 19:13:05,693][0m Trial 2 finished with value: 0.1173868147526754 and parameters: {'n_

In [25]:
print("Blood tuning result:")
print(f"Best value: {blood_study.best_value:.5f}")
print(f"Best params:")

for key, value in blood_study.best_params.items():
    print(f"\t\'{key}\': {value},")

Blood tuning result:
Best value: 0.56883
Best params:
	'n_estimators': 1700,
	'learning_rate': 0.026000000000000002,
	'max_depth': 26,
	'lambda': 0.0022106369528429484,
	'alpha': 0.9133162515639958,
	'min_child_weight': 18,
	'gamma': 9,
	'colsample_bytree': 0.9,
	'colsample_bylevel': 0.6000000000000001,
	'colsample_bynode': 0.30000000000000004,


In [26]:
brain_study = optuna.create_study(direction=direction, study_name="Brain_Regressor")
func = lambda trial: objective(trial, brain_X, brain_y)
brain_study.optimize(func, n_trials=n_trials)

[32m[I 2023-01-11 19:18:55,373][0m A new study created in memory with name: Brain_Regressor[0m
[32m[I 2023-01-11 19:19:00,471][0m Trial 0 finished with value: 0.32978042639279626 and parameters: {'n_estimators': 650, 'learning_rate': 0.025, 'max_depth': 20, 'lambda': 0.0013889738498531737, 'alpha': 0.05943888448988878, 'min_child_weight': 3, 'gamma': 3, 'colsample_bytree': 0.7000000000000001, 'colsample_bylevel': 0.6000000000000001, 'colsample_bynode': 0.5}. Best is trial 0 with value: 0.32978042639279626.[0m
[32m[I 2023-01-11 19:19:13,459][0m Trial 1 finished with value: 0.4812016513535243 and parameters: {'n_estimators': 800, 'learning_rate': 0.009000000000000001, 'max_depth': 26, 'lambda': 0.0035178551496513983, 'alpha': 0.011806660160022906, 'min_child_weight': 1, 'gamma': 20, 'colsample_bytree': 0.7000000000000001, 'colsample_bylevel': 0.8, 'colsample_bynode': 0.8}. Best is trial 1 with value: 0.4812016513535243.[0m
[32m[I 2023-01-11 19:19:18,297][0m Trial 2 finished wi

In [27]:
print("Brain tuning result:")
print(f"Best value: {brain_study.best_value:.5f}")
print(f"Best params:")

for key, value in brain_study.best_params.items():
    print(f"\t\'{key}\': {value},")

Brain tuning result:
Best value: 0.60631
Best params:
	'n_estimators': 1450,
	'learning_rate': 0.013000000000000001,
	'max_depth': 11,
	'lambda': 0.1245428483067459,
	'alpha': 0.20833503659100544,
	'min_child_weight': 1,
	'gamma': 12,
	'colsample_bytree': 0.4,
	'colsample_bylevel': 0.1,
	'colsample_bynode': 0.4,


In [28]:
ratio_study = optuna.create_study(direction=direction, study_name="Ratio_Regressor")
func = lambda trial: objective(trial, ratio_X, ratio_y)
ratio_study.optimize(func, n_trials=n_trials)

[32m[I 2023-01-11 19:23:05,805][0m A new study created in memory with name: Ratio_Regressor[0m
[32m[I 2023-01-11 19:23:08,802][0m Trial 0 finished with value: -0.0034815830923528245 and parameters: {'n_estimators': 3000, 'learning_rate': 0.009000000000000001, 'max_depth': 0, 'lambda': 0.007632309746011552, 'alpha': 8.172215586698337, 'min_child_weight': 12, 'gamma': 2, 'colsample_bytree': 0.4, 'colsample_bylevel': 0.9, 'colsample_bynode': 0.5}. Best is trial 0 with value: -0.0034815830923528245.[0m
[32m[I 2023-01-11 19:23:09,553][0m Trial 1 finished with value: 0.2299909865012447 and parameters: {'n_estimators': 100, 'learning_rate': 0.026000000000000002, 'max_depth': 22, 'lambda': 0.36808734395247433, 'alpha': 0.10735847642067452, 'min_child_weight': 30, 'gamma': 16, 'colsample_bytree': 0.7000000000000001, 'colsample_bylevel': 0.2, 'colsample_bynode': 0.9}. Best is trial 1 with value: 0.2299909865012447.[0m
[32m[I 2023-01-11 19:23:13,464][0m Trial 2 finished with value: 0.3

In [29]:
print("Ratio tuning result:")
print(f"Best value: {ratio_study.best_value:.5f}")
print(f"Best params:")

for key, value in ratio_study.best_params.items():
    print(f"\t\'{key}\': {value},")

Ratio tuning result:
Best value: 0.38157
Best params:
	'n_estimators': 1400,
	'learning_rate': 0.018000000000000002,
	'max_depth': 23,
	'lambda': 5.599908028889678,
	'alpha': 8.503428132294024,
	'min_child_weight': 11,
	'gamma': 3,
	'colsample_bytree': 0.4,
	'colsample_bylevel': 1.0,
	'colsample_bynode': 0.30000000000000004,
