In [7]:
import optuna
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import StratifiedKFold, train_test_split, KFold
from xgboost.sklearn import XGBRegressor
import pandas as pd
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.neural_network import MLPRegressor as MLP
# from lightgbm import LGBMRegressor
import lightgbm as lgb
from sklearn.preprocessing import MinMaxScaler
import global_config as cfg

In [8]:
# Model families supported by this notebook, as listed in the project config.
model_enum = cfg.model_enum
# Family tuned by this run; index 4 is the MLP branch of objective().
model_type = model_enum[4]

In [9]:
def get_X_Y(csvfile):
    """Load the dataset and split it into scaled features and targets.

    Rows containing NaN or +/-inf in any column are discarded. The columns
    'SMILES', 'Blood', 'Brain' and 'Ratio' are removed from the feature
    matrix and every remaining column is rescaled with ``MinMaxScaler``.

    Parameters
    ----------
    csvfile
        Anything ``pandas.read_csv`` accepts (path or file-like object).

    Returns
    -------
    tuple
        ``(X, blood_y, brain_y, ratio_y, SMILES)``:
        ``X`` is a float64 DataFrame with a fresh 0..n-1 index and integer
        column labels; ``blood_y``, ``brain_y`` and ``ratio_y`` are 1-D
        NumPy arrays; ``SMILES`` is the 'SMILES' column of the filtered
        table. ``SMILES`` keeps the row labels of the CSV, so pair it with
        ``X`` by position, not by label.
    """
    df = pd.read_csv(csvfile)
    # Drop rows containing invalid values (NaN or +/-inf).
    # `axis` must be passed by keyword: pandas >= 2.0 rejects positional
    # arguments to DataFrame.any().
    has_invalid = df.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    df = df[~has_invalid]
    df = df.dropna(axis=0, how='any')

    X = df.drop(['SMILES', 'Blood', 'Brain', 'Ratio'], axis=1)
    # NOTE(review): the scaler is fitted on the whole table, i.e. before any
    # train/validation split made by the caller -- confirm this is intended.
    X = MinMaxScaler().fit_transform(X)
    # .to_numpy() replaces Series.ravel(), which recent pandas deprecates.
    blood_y = df['Blood'].to_numpy()
    brain_y = df['Brain'].to_numpy()
    ratio_y = df['Ratio'].to_numpy()
    SMILES = df['SMILES']
    return pd.DataFrame(X).astype('float64'), blood_y, brain_y, ratio_y, SMILES

In [10]:
def _suggest_params(trial, kind):
    """Return the constructor keyword arguments for the model family `kind`.

    `kind` is compared against the entries of the notebook-level
    `model_enum` (0: XGBoost, 1: LightGBM, 2: SVR, 3: random forest,
    4: MLP). Raises ValueError when it matches none of them.
    """
    if kind == model_enum[0]:   # XGB
        # Fixed configuration; the commented-out entries are the search
        # space to re-enable when XGBoost itself should be tuned.
        return {
            'n_estimators': 2200,
            'learning_rate': 0.02,
            'max_depth': 5,
            'lambda': 0.010075781713370716,
            'alpha': 0.19312610292731117,
            'min_child_weight': 8,
            'gamma': 17,
            'colsample_bytree': 1.0,
            'colsample_bylevel': 0.5,
            'colsample_bynode': 0.9,
            # "n_estimators": trial.suggest_int("n_estimators", 50, 3000, step=50),
            # "learning_rate": trial.suggest_float('learning_rate', 0.005, 0.03, step=0.001),
            # "max_depth": trial.suggest_int("max_depth", 0, 30),
            # 'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
            # 'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
            # "min_child_weight": trial.suggest_int("min_child_weight", 1, 30),
            # 'gamma': trial.suggest_int("gamma", 0, 20, step=1),
            # 'colsample_bytree': trial.suggest_float("colsample_bytree", 0, 1, step=0.1),
            # 'colsample_bylevel': trial.suggest_float("colsample_bylevel", 0, 1, step=0.1),
            # 'colsample_bynode': trial.suggest_float("colsample_bynode", 0, 1, step=0.1),
        }
    if kind == model_enum[1]:   # LGBM
        return {
            "boosting_type": trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
            "max_depth": trial.suggest_int("max_depth", 1, 30),
            "learning_rate": trial.suggest_categorical(
                'learning_rate',
                [0.005, 0.008, 0.01, 0.012, 0.014, 0.016, 0.018, 0.02, 0.023, 0.025, 0.028, 0.03]),
            "n_estimators": trial.suggest_int("n_estimators", 50, 3000, step=50),
            "objective": trial.suggest_categorical('objective', ['regression']),
            "min_child_samples": trial.suggest_int("min_child_samples", 5, 30),
            # 'colsample_bytree': trial.suggest_float("colsample_bytree", 0, 1, step=0.1),
            # suggest_float(..., log=True) is the supported spelling of the
            # deprecated suggest_loguniform and samples the same distribution.
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
            # 'feature_fraction': trial.suggest_categorical('feature_fraction', [0.5])
            # 'verbose': trial.suggest_categorical('verbose', [-1])
        }
    if kind == model_enum[2]:   # SVM
        return {
            "C": trial.suggest_categorical('C', [0.1, 1, 10]),
            'gamma': trial.suggest_categorical("gamma", ['scale', 'auto']),
            'tol': trial.suggest_categorical("tol", [1e-2, 1e-3, 1e-4]),
            'max_iter': trial.suggest_categorical("max_iter", [1000, 5000, 10000]),
            'epsilon': trial.suggest_float("epsilon", 0.1, 1.0),
        }
    if kind == model_enum[3]:   # RF
        return {
            "n_estimators": trial.suggest_int("n_estimators", 100, 3000, step=100),
            "max_depth": trial.suggest_int("max_depth", 1, 30),
        }
    if kind == model_enum[4]:   # MLP
        return {
            "hidden_layer_sizes": trial.suggest_categorical(
                "hidden_layer_sizes", [(50,), (100,), (150,), (200,)]),
            "activation": trial.suggest_categorical("activation", ['tanh', 'relu']),
            "solver": trial.suggest_categorical("solver", ['lbfgs', 'sgd', 'adam']),
            "early_stopping": True,
            "max_iter": trial.suggest_int("max_iter", 200, 1000, step=100),
        }
    raise ValueError(f"Unsupported model_type: {kind!r}")


def _fit_model(kind, params, X_train, y_train, X_valid, y_valid):
    """Build the regressor for `kind` from `params`, fit it and return it.

    The validation fold is only used by the boosted models (as `eval_set`).
    """
    if kind == model_enum[0]:   # XGB
        # early_stopping_rounds belongs to the constructor: passing it to
        # fit() is deprecated since xgboost 1.6 and rejected by later releases.
        # NOTE(review): early stopping monitors the same fold that is scored
        # afterwards, so the reported RMSE is optimistic for this branch.
        model = XGBRegressor(early_stopping_rounds=100, **params)
        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)
    elif kind == model_enum[1]:   # LGBM
        model = lgb.sklearn.LGBMRegressor(**params)
        # callbacks = [lgb.early_stopping(100, verbose=0), lgb.log_evaluation(period=0)]
        callbacks = [lgb.log_evaluation(period=0)]
        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], callbacks=callbacks)
    elif kind == model_enum[2]:   # SVM
        model = SVR(**params)
        model.fit(X_train, y_train)
    elif kind == model_enum[3]:   # RF
        model = RF(**params)
        model.fit(X_train, y_train)
    elif kind == model_enum[4]:   # MLP
        model = MLP(**params)
        model.fit(X_train, y_train)
    else:
        raise ValueError(f"Unsupported model_type: {kind!r}")
    return model


def objective(trial, X, y, n_splits=5, random_state=None):
    """Optuna objective: mean cross-validated RMSE of the selected model.

    The model family is taken from the notebook-level `model_type`, which
    must equal one of the entries of `model_enum`.

    Parameters
    ----------
    trial : optuna trial used to sample the hyper-parameters.
    X : DataFrame of features (rows are selected with ``.iloc``).
    y : 1-D array-like of targets, aligned with ``X`` by position.
    n_splits : number of shuffled K-fold splits (default 5, as before).
    random_state : seed forwarded to ``KFold``; the default ``None`` keeps
        the previous unseeded behaviour, pass an int for repeatable folds.

    Returns
    -------
    float
        Mean over the folds of the root-mean-squared error on the held-out
        part.

    Raises
    ------
    ValueError
        If `model_type` is not one of the supported families (raised before
        any data is touched).
    """
    param_grid = _suggest_params(trial, model_type)

    # Positional indexing below must not depend on y's index labels.
    y = np.asarray(y)
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    cv_scores = np.empty(n_splits)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model = _fit_model(model_type, param_grid, X_train, y_train, X_test, y_test)
        preds = model.predict(X_test)
        # cv_scores[idx] = r2_score(y_test, preds)
        cv_scores[idx] = np.sqrt(mean_squared_error(y_test, preds))

    return np.mean(cv_scores)

In [11]:
# Tuning configuration: input dataset and number of Optuna trials per study.
csvfile = cfg.ECCF_csvfilepath
n_trials = 30

# Scaled feature table plus the three targets; the SMILES column is not used here.
X, blood_y, brain_y, ratio_y, _ = get_X_Y(csvfile)

# Per-target feature subsets, selected by the column positions listed in the config.
blood_X = X.iloc[:, cfg.blood_fea]
brain_X = X.iloc[:, cfg.brain_fea]

print(model_type)


MLP


In [12]:
def func(trial):
    """Objective bound to the blood features and blood target."""
    return objective(trial, blood_X, blood_y)


# Search for the hyper-parameters that minimise the value returned by objective().
blood_study = optuna.create_study(study_name="Blood_Regressor", direction="minimize")
blood_study.optimize(func, n_trials=n_trials)

[32m[I 2023-01-05 00:45:26,190][0m A new study created in memory with name: Blood_Regressor[0m
[32m[I 2023-01-05 00:45:28,904][0m Trial 0 finished with value: 6.176820191749538 and parameters: {'hidden_layer_sizes': (150,), 'activation': 'tanh', 'solver': 'sgd', 'max_iter': 300}. Best is trial 0 with value: 6.176820191749538.[0m
[32m[I 2023-01-05 00:45:29,172][0m Trial 1 finished with value: 5.691635627688302 and parameters: {'hidden_layer_sizes': (150,), 'activation': 'relu', 'solver': 'sgd', 'max_iter': 200}. Best is trial 1 with value: 5.691635627688302.[0m
[32m[I 2023-01-05 00:45:31,150][0m Trial 2 finished with value: 5.5308087117134015 and parameters: {'hidden_layer_sizes': (150,), 'activation': 'relu', 'solver': 'adam', 'max_iter': 600}. Best is trial 2 with value: 5.5308087117134015.[0m
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.ht

Blood tuning result:
Best value: 3.78960
Best params:
	'hidden_layer_sizes': (50,),
	'activation': relu,
	'solver': lbfgs,
	'max_iter': 600,


In [None]:
# Report the best objective value of the blood study and the parameters that produced it.
print("Blood tuning result:")
print(f"Best value: {blood_study.best_value:.5f}")
print("Best params:")
for key, value in blood_study.best_params.items():
    print(f"\t\'{key}\': {value},")

In [13]:
def func(trial):
    """Objective bound to the brain features and brain target."""
    return objective(trial, brain_X, brain_y)


# Search for the hyper-parameters that minimise the value returned by objective().
brain_study = optuna.create_study(study_name="Brain_Regressor", direction="minimize")
brain_study.optimize(func, n_trials=n_trials)


[32m[I 2023-01-05 00:48:01,283][0m A new study created in memory with name: Brain_Regressor[0m
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    ht

In [None]:
# Report the best objective value of the brain study and the parameters that produced it.
print("Brain tuning result:")
print(f"Best value: {brain_study.best_value:.5f}")
print("Best params:")
for key, value in brain_study.best_params.items():
    print(f"\t\'{key}\': {value},")