In [1]:
import deepchem as dc
import xgboost as xgb
import optuna
optuna.logging.set_verbosity(optuna.logging.CRITICAL)

import pandas as pd
import numpy as np

In [2]:
def load_split_dfs(input_dir):
    test = []
    for i in range(5):
        test.append(pd.read_pickle(f'{input_dir}/test{i}'))
    return test

def ds_from_df_split(split_dfs, featurizer):
    split_dss = []
    for i in range(5):
        df = split_dfs[i]
        X = featurizer.featurize(df.smiles)
        ds = dc.data.DiskDataset.from_numpy(X=X, y=np.vstack(df.label.to_numpy()), ids=df.smiles)
        split_dss.append(ds)
    all_dss = dc.data.DiskDataset.merge(split_dss)
    
    transformer = dc.trans.NormalizationTransformer(transform_y=True, dataset=all_dss)
    for i in range(5):
        split_dss[i] = transformer.transform(split_dss[i])
    
    return all_dss, split_dss, transformer

def get_kfold_from_ds_split(split_dss):
    kfold = []
    for i in range(5):
        temp_dss = split_dss.copy()
        temp_test = temp_dss.pop(i)
        kfold.append((dc.data.DiskDataset.merge(temp_dss), temp_test))
    return kfold

In [7]:
def random_forest_model_from_trial(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 1, 500, log=True),
        'max_depth': trial.suggest_int('max_depth', 1, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 1000, log=True),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 1000, log=True),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 2, 100, log=True),
    }
    sklearn_random_forest = RandomForestRegressor(**param, n_jobs=-1)
    rf_model = dc.models.SklearnModel(sklearn_random_forest)
    return rf_model

def random_forest_model_from_param(param):
    sklearn_random_forest = RandomForestRegressor(**param, n_jobs=-1)
    rf_model = dc.models.SklearnModel(sklearn_random_forest)
    return rf_model

def random_forest_optuna(trial, kfold):
    mse = []
    for k in kfold:
        rf_model = random_forest_model_from_trial(trial)
        rf_model.fit(k[0].complete_shuffle())
        y_pred = rf_model.predict(k[1])
        y_meas = k[1].y
        mse.append(dc.metrics.mean_squared_error(y_meas, y_pred))
        
    return sum(mse)/len(mse)    

In [None]:
def extreme_gradient_booster_from_trial(trial):
    param = {
        'max_depth': trial.suggest_int('max_depth', 3, 9, step=2),
        'learning_rate': trial.suggest_float("learning_rate", 1e-8, 1.0, log=True),
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        "gamma": trial.suggest_float("gamma", 1e-8, 1.0, log=True),
        "min_child_weight": trial.suggest_int("min_child_weight", 2, 10),
        # 'max_delta_step': 
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 1.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 1.0, log=True),
    }
    xgboost_regressor = xgb.XGBRegressor(**param, n_estimators=100, objective='reg:squarederror', tree_method='exact', verbosity=0, n_jobs=-1)
    xgb_model = dc.models.GBDTModel(model=xgboost_regressor, early_stopping_rounds=20, eval_metric='rmse')
    return xgb_model

def extreme_gradient_booster_optuna(trial, kfold):
    mse = []
    for k in kfold:
        xgb_model = extreme_gradient_booster_from_trial(trial)
        xgb_model.fit_with_eval(k[0].complete_shuffle(), k[1])
        y_pred = xgb_model.predict(k[1])
        y_meas = k[1].y
        mse.append(dc.metrics.mean_squared_error(y_meas, y_pred))
    return sum(mse)/len(mse)

In [4]:
split_dfs = load_split_dfs('sma1_random_split')

In [5]:
LEN_FEAT_VEC = 2048
featurizer = dc.feat.CircularFingerprint(radius=2, size=LEN_FEAT_VEC, chiral=True)

In [6]:
all_dss, split_dss, transformer = ds_from_df_split(split_dfs, featurizer)

In [7]:
train_tests = get_kfold_from_ds_split(split_dss)

In [None]:
output_info = []

for i,tt in enumerate(train_tests):
    splitter = dc.splits.RandomSplitter()
    kfold = splitter.k_fold_split(dataset=tt[0], k=5)
    study = optuna.create_study(direction='minimize')
    study.optimize(lambda trial: extreme_gradient_booster_optuna(trial, kfold), n_trials=1)
    
    test_mse = []
    for j in range(5):
        tuned_xgb_model = extreme_gradient_booster_from_trial(study.best_trial)
        tuned_xgb_model.fit_with_eval(tt[0].complete_shuffle(), tt[1])
        y_pred = tuned_xgb_model.predict(tt[1])
        y_meas = tt[1].y
        test_mse.append(dc.metrics.mean_squared_error(y_meas, y_pred))
    
    output_info.append((i, study.best_value, str(study.best_params), sum(test_mse)/len(test_mse), test_mse))

[0]	validation_0-rmse:1.11596
[1]	validation_0-rmse:1.11591
[2]	validation_0-rmse:1.11586
[3]	validation_0-rmse:1.11581
[4]	validation_0-rmse:1.11576
[5]	validation_0-rmse:1.11571
[6]	validation_0-rmse:1.11566
[7]	validation_0-rmse:1.11561
[8]	validation_0-rmse:1.11556
[9]	validation_0-rmse:1.11552
[10]	validation_0-rmse:1.11547
[11]	validation_0-rmse:1.11542
[12]	validation_0-rmse:1.11537
[13]	validation_0-rmse:1.11532
[14]	validation_0-rmse:1.11527
[15]	validation_0-rmse:1.11522
[16]	validation_0-rmse:1.11517
[17]	validation_0-rmse:1.11512
[18]	validation_0-rmse:1.11507
[19]	validation_0-rmse:1.11502
[20]	validation_0-rmse:1.11498
[21]	validation_0-rmse:1.11493
[22]	validation_0-rmse:1.11488
[23]	validation_0-rmse:1.11483
[24]	validation_0-rmse:1.11478
[25]	validation_0-rmse:1.11473
[26]	validation_0-rmse:1.11468
[27]	validation_0-rmse:1.11463
[28]	validation_0-rmse:1.11458
[29]	validation_0-rmse:1.11453
[30]	validation_0-rmse:1.11448
[31]	validation_0-rmse:1.11444
[32]	validation_0-

In [17]:
out_df = pd.DataFrame(output_info, columns=['split_index', 'avg_valid_mse', 'best_params', 'avg_test_mse', 'test_mses'])
out_df

Unnamed: 0,split_index,avg_valid_mse,best_params,avg_test_mse,test_mses
0,0,1.267674,"{'max_depth': 7, 'learning_rate': 2.4997980358...",1.175157,"[1.1751569954605625, 1.1751569954605625, 1.175..."
1,1,1.237327,"{'max_depth': 3, 'learning_rate': 7.5876229123...",1.231939,"[1.231879692471963, 1.2319951985990796, 1.2319..."
2,2,1.274578,"{'max_depth': 3, 'learning_rate': 3.1962045698...",1.151279,"[1.151278512250133, 1.1512785081152428, 1.1512..."
3,3,0.967727,"{'max_depth': 5, 'learning_rate': 0.0001216151...",1.108013,"[1.1080118690098781, 1.1080137220715216, 1.108..."
4,4,1.044351,"{'max_depth': 3, 'learning_rate': 9.9790180820...",1.119327,"[1.1193267218471303, 1.11932643434555, 1.11932..."
