In [21]:
import optuna
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import StratifiedKFold, train_test_split, KFold
from xgboost.sklearn import XGBRegressor
import pandas as pd
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor as RF
# from lightgbm import LGBMRegressor
import lightgbm as lgb
from sklearn.preprocessing import MinMaxScaler
import global_config as cfg

In [22]:
# Regressor back-ends understood by the tuning code; `objective` selects its
# search space by comparing `model_type` against entries of this list.
model_enum = [
    'XGB',   # index 0
    'LGBM',  # index 1
    'SVM',   # index 2
    'RF',    # index 3
]
# Back-end tuned in this run.
model_type = model_enum[2]  # 'SVM'

In [23]:
def get_X_Y(csvfile):
    """Load the descriptor CSV and split it into scaled features and targets.

    Rows holding NaN or +/-inf in any column are discarded. Every column
    except ``SMILES``, ``Blood``, ``Brain`` and ``Ratio`` is treated as a
    feature and rescaled with ``MinMaxScaler``.

    Parameters
    ----------
    csvfile
        Anything ``pandas.read_csv`` accepts (path or file-like object).

    Returns
    -------
    tuple
        ``(X, blood_y, brain_y, ratio_y, SMILES)`` where ``X`` is a float64
        DataFrame of scaled features with positional (0..n-1) row and column
        labels, the three ``*_y`` values are NumPy arrays taken from the
        ``Blood``, ``Brain`` and ``Ratio`` columns, and ``SMILES`` is the
        ``SMILES`` column as a Series that keeps the row labels of the
        surviving rows.

    Raises
    ------
    ValueError
        If no row survives the NaN/inf filtering.
    """
    df = pd.read_csv(csvfile)

    # Drop every ROW that contains an invalid value (NaN or +/-inf).
    # `axis` is passed by keyword because pandas 2.x no longer accepts it
    # positionally in DataFrame.any.
    invalid = df.isna() | df.isin([np.inf, -np.inf])
    df = df.loc[~invalid.any(axis=1)]
    if df.empty:
        raise ValueError(f"No valid rows left in {csvfile!r} after removing NaN/inf entries")

    X = df.drop(['SMILES', 'Blood', 'Brain', 'Ratio'], axis=1)
    # NOTE(review): the scaler is fitted on all surviving rows, so any
    # train/test split performed later sees min/max statistics that include
    # the held-out rows.
    X = MinMaxScaler().fit_transform(X)

    # to_numpy() replaces the deprecated Series.ravel().
    blood_y = df['Blood'].to_numpy()
    brain_y = df['Brain'].to_numpy()
    ratio_y = df['Ratio'].to_numpy()
    SMILES = df['SMILES']
    return pd.DataFrame(X).astype('float64'), blood_y, brain_y, ratio_y, SMILES

In [24]:
# Discrete learning-rate candidates shared by the boosted-tree search spaces.
_LEARNING_RATES = [0.005, 0.008, 0.01, 0.012, 0.014, 0.016, 0.018, 0.02, 0.023, 0.025, 0.028, 0.03]


def _suggest_params(trial):
    """Sample one hyper-parameter set for the globally selected ``model_type``."""
    if model_type == model_enum[0]:  # XGB
        return {
            "n_estimators": trial.suggest_int("n_estimators", 50, 3000, step=50),
            "learning_rate": trial.suggest_categorical("learning_rate", _LEARNING_RATES),
            # Lower bound is 1: XGBoost treats 0 as "unlimited depth", which
            # the `exact` tree method rejects.
            "max_depth": trial.suggest_int("max_depth", 1, 30),
            "lambda": trial.suggest_float("lambda", 1e-3, 10.0, log=True),
            "alpha": trial.suggest_float("alpha", 1e-3, 10.0, log=True),
            "min_child_weight": trial.suggest_int("min_child_weight", 1, 30),
            "gamma": trial.suggest_int("gamma", 0, 20, step=1),
            # Column-sampling ratios must lie in (0, 1]; 0 is not a valid value.
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1, step=0.1),
            "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.1, 1, step=0.1),
            "colsample_bynode": trial.suggest_float("colsample_bynode", 0.1, 1, step=0.1),
        }
    if model_type == model_enum[1]:  # LGBM
        return {
            "boosting_type": trial.suggest_categorical("boosting_type", ["gbdt", "dart"]),
            "max_depth": trial.suggest_int("max_depth", 1, 30),
            "learning_rate": trial.suggest_categorical("learning_rate", _LEARNING_RATES),
            "n_estimators": trial.suggest_int("n_estimators", 50, 3000, step=50),
            "objective": trial.suggest_categorical("objective", ["regression"]),
            "min_child_samples": trial.suggest_int("min_child_samples", 5, 30),
            "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 10.0, log=True),
            "reg_alpha": trial.suggest_float("reg_alpha", 1e-3, 10.0, log=True),
        }
    if model_type == model_enum[2]:  # SVM
        return {
            "C": trial.suggest_categorical("C", [0.1, 1, 10]),
            "gamma": trial.suggest_categorical("gamma", ["scale", "auto"]),
            "tol": trial.suggest_categorical("tol", [1e-2, 1e-3, 1e-4]),
            "max_iter": trial.suggest_categorical("max_iter", [1000, 5000, 10000]),
            "epsilon": trial.suggest_float("epsilon", 0.1, 1.0),
        }
    if model_type == model_enum[3]:  # RF
        # Only arguments that RandomForestRegressor actually accepts; the
        # LightGBM-style keys used previously made RF(**params) raise TypeError.
        return {
            "n_estimators": trial.suggest_int("n_estimators", 50, 1000, step=50),
            "max_depth": trial.suggest_int("max_depth", 1, 30),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 20),
            "max_features": trial.suggest_float("max_features", 0.1, 1.0, step=0.1),
        }
    raise ValueError(f"Unsupported model_type {model_type!r}; expected one of {model_enum}")


def _fit_model(params, X_train, y_train, X_test, y_test):
    """Build and fit the estimator for ``model_type`` using ``params``."""
    if model_type == model_enum[0]:  # XGB
        # early_stopping_rounds is a constructor argument in current XGBoost;
        # passing it to fit() is no longer supported in 2.x.
        # NOTE(review): the held-out fold doubles as the early-stopping set,
        # so the XGB score reported for that fold is optimistically biased.
        model = XGBRegressor(early_stopping_rounds=100, **params)
        model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
    elif model_type == model_enum[1]:  # LGBM
        model = lgb.sklearn.LGBMRegressor(**params)
        # period=0 silences the per-iteration evaluation log.
        callbacks = [lgb.log_evaluation(period=0)]
        model.fit(X_train, y_train, eval_set=[(X_test, y_test)], callbacks=callbacks)
    elif model_type == model_enum[2]:  # SVM
        model = SVR(**params)
        model.fit(X_train, y_train)
    elif model_type == model_enum[3]:  # RF
        model = RF(**params)
        model.fit(X_train, y_train)
    else:
        raise ValueError(f"Unsupported model_type {model_type!r}; expected one of {model_enum}")
    return model


def objective(trial, X, y, n_splits=5, random_state=42):
    """Optuna objective: mean cross-validated RMSE for one sampled configuration.

    Parameters
    ----------
    trial
        Optuna trial used to sample hyper-parameters.
    X
        Feature DataFrame; rows are selected positionally with ``.iloc``.
    y
        Target values aligned with the rows of ``X``; converted to a NumPy
        array so that fold indices are always applied positionally.
    n_splits
        Number of K-fold splits (default 5, the value previously hard-coded).
    random_state
        Seed for the shuffled K-fold. Fixing it makes every trial use the
        same folds, so trial scores are directly comparable.

    Returns
    -------
    float
        Mean RMSE over the folds (lower is better).

    Raises
    ------
    ValueError
        If the module-level ``model_type`` is not one of ``model_enum``.
    """
    params = _suggest_params(trial)
    y = np.asarray(y)

    cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    fold_rmse = []
    for train_idx, test_idx in cv.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model = _fit_model(params, X_train, y_train, X_test, y_test)
        preds = model.predict(X_test)
        fold_rmse.append(np.sqrt(mean_squared_error(y_test, preds)))

    return float(np.mean(fold_rmse))

In [25]:
csvfile = cfg.desc_csvfilepath
n_trials = 30
# Seed for the Optuna sampler so repeated runs propose the same sequence of
# hyper-parameters (TPE is what create_study uses by default).
SAMPLER_SEED = 42

# The SMILES column returned by get_X_Y is not needed for tuning.
X, blood_y, brain_y, ratio_y, _ = get_X_Y(csvfile)
# cfg.blood_fea / cfg.brain_fea are applied as positional column selectors.
blood_X = X.iloc[:, cfg.blood_fea]
brain_X = X.iloc[:, cfg.brain_fea]

# Tune the regressor for the Blood target; the objective value is minimized.
blood_study = optuna.create_study(
    direction="minimize",
    study_name="Blood_Regressor",
    sampler=optuna.samplers.TPESampler(seed=SAMPLER_SEED),
)
func = lambda trial: objective(trial, blood_X, blood_y)
blood_study.optimize(func, n_trials=n_trials)

# Tune the regressor for the Brain target with its own feature subset.
brain_study = optuna.create_study(
    direction="minimize",
    study_name="Brain_Regressor",
    sampler=optuna.samplers.TPESampler(seed=SAMPLER_SEED),
)
func = lambda trial: objective(trial, brain_X, brain_y)
brain_study.optimize(func, n_trials=n_trials)


[32m[I 2023-01-01 17:38:05,829][0m A new study created in memory with name: Blood_Regressor[0m
[32m[I 2023-01-01 17:38:05,971][0m Trial 0 finished with value: 5.397350960955449 and parameters: {'C': 10, 'gamma': 'auto', 'tol': 0.001, 'max_iter': 1000, 'epsilon': 0.8792766446967049}. Best is trial 0 with value: 5.397350960955449.[0m
[32m[I 2023-01-01 17:38:06,135][0m Trial 1 finished with value: 5.340685303313385 and parameters: {'C': 0.1, 'gamma': 'scale', 'tol': 0.001, 'max_iter': 100, 'epsilon': 0.39994117099516024}. Best is trial 1 with value: 5.340685303313385.[0m
[32m[I 2023-01-01 17:38:06,277][0m Trial 2 finished with value: 4.849072897775483 and parameters: {'C': 1, 'gamma': 'scale', 'tol': 0.0001, 'max_iter': 100, 'epsilon': 0.14960142945693464}. Best is trial 2 with value: 4.849072897775483.[0m
[32m[I 2023-01-01 17:38:06,412][0m Trial 3 finished with value: 5.49688746061282 and parameters: {'C': 10, 'gamma': 'scale', 'tol': 0.0001, 'max_iter': 10000, 'epsilon': 0

KeyboardInterrupt: 

In [None]:
def _print_study_summary(label, study):
    """Print the best objective value and best parameters of one study."""
    print(f"{label} tuning result:")
    print(f"Best value: {study.best_value:.5f}")
    print("Best params:")

    # One line per parameter, formatted so it can be pasted into a dict literal.
    for key, value in study.best_params.items():
        print(f"\t'{key}': {value},")


_print_study_summary("Blood", blood_study)
_print_study_summary("Brain", brain_study)


Blood tuning result:
Best value: 3.50917
Best params:
	'n_estimators': 1950,
	'learning_rate': 0.014,
	'max_depth': 18,
	'lambda': 8.645496158267079,
	'alpha': 0.45639661861114994,
	'min_child_weight': 1,
	'gamma': 9,
	'colsample_bytree': 0.30000000000000004,
	'colsample_bylevel': 0.30000000000000004,
	'colsample_bynode': 0.9,
Brain tuning result:
Best value: 25.20963
Best params:
	'n_estimators': 1550,
	'learning_rate': 0.018,
	'max_depth': 24,
	'lambda': 0.07683686528439758,
	'alpha': 0.008538159369120378,
	'min_child_weight': 16,
	'gamma': 8,
	'colsample_bytree': 0.4,
	'colsample_bylevel': 0.8,
	'colsample_bynode': 0.4,
