In [39]:
import deepchem as dc
from sklearn.ensemble import RandomForestRegressor
import optuna
optuna.logging.set_verbosity(optuna.logging.CRITICAL)

import pandas as pd
import numpy as np

In [35]:
def load_split_dfs(input_dir, n_splits=5):
    """Load the pickled split objects ``test0`` ... ``test{n_splits - 1}``.

    Parameters
    ----------
    input_dir : str or path-like
        Directory that holds the pickled files.
    n_splits : int, optional
        How many consecutive ``test{i}`` files to read, starting at ``i = 0``.
        Defaults to 5.

    Returns
    -------
    list
        The unpickled objects, in index order.

    Notes
    -----
    ``pd.read_pickle`` can execute arbitrary code while unpickling, so this
    should only be pointed at files from a trusted source.
    """
    return [pd.read_pickle(f'{input_dir}/test{i}') for i in range(n_splits)]

def ds_from_df_split(split_dfs, featurizer):
    """Featurize every split DataFrame and build y-normalized DeepChem datasets.

    Parameters
    ----------
    split_dfs : sequence of pandas.DataFrame
        One frame per split; each must provide ``smiles`` and ``label`` columns.
        Every frame in the sequence is used, however many there are.
    featurizer
        Object whose ``featurize(smiles)`` returns the feature matrix of a split.

    Returns
    -------
    all_dss
        Merge of the per-split datasets, created before normalization is applied.
    split_dss : list
        The per-split datasets after ``transformer.transform``, in the same
        order as ``split_dfs``.
    transformer
        ``NormalizationTransformer(transform_y=True)`` built from ``all_dss``.

    Notes
    -----
    The transformer is built from the merge of all splits, so its statistics
    include whichever split is later held out for testing.
    """
    raw_dss = []
    for df in split_dfs:
        X = featurizer.featurize(df.smiles)
        # Stack the labels into a 2-D array with one row per sample.
        y = np.vstack(df.label.to_numpy())
        raw_dss.append(dc.data.DiskDataset.from_numpy(X=X, y=y, ids=df.smiles))
    all_dss = dc.data.DiskDataset.merge(raw_dss)

    transformer = dc.trans.NormalizationTransformer(transform_y=True, dataset=all_dss)
    split_dss = [transformer.transform(ds) for ds in raw_dss]

    return all_dss, split_dss, transformer

def get_kfold_from_ds_split(split_dss):
    """Build leave-one-split-out (train, test) pairs from pre-split datasets.

    For every position ``i`` the dataset ``split_dss[i]`` is held out and the
    remaining datasets are merged into a single training dataset.

    Parameters
    ----------
    split_dss : sequence
        Datasets, one per split. The sequence itself is not modified.

    Returns
    -------
    list of tuple
        ``(merged_train, held_out)`` pairs, one per entry of ``split_dss``,
        in split order.
    """
    kfold = []
    for i, held_out in enumerate(split_dss):
        # Everything except the i-th split forms the training side of this fold.
        train_parts = [ds for j, ds in enumerate(split_dss) if j != i]
        kfold.append((dc.data.DiskDataset.merge(train_parts), held_out))
    return kfold

def get_random_forest_model(trial):
    """Sample random-forest hyperparameters from ``trial`` and wrap the model.

    Parameters
    ----------
    trial
        Object exposing ``suggest_int(name, low, high, log=...)``; it is asked
        for five integer hyperparameters, in the order listed below.

    Returns
    -------
    dc.models.SklearnModel
        Wrapper around a ``RandomForestRegressor`` built from the sampled
        values and ``n_jobs=-1``.
    """
    # (name, low, high, extra suggest_int kwargs); only max_depth is sampled
    # without log=True.
    search_space = (
        ('n_estimators', 1, 500, {'log': True}),
        ('max_depth', 1, 50, {}),
        ('min_samples_split', 2, 1000, {'log': True}),
        ('min_samples_leaf', 1, 1000, {'log': True}),
        ('max_leaf_nodes', 2, 100, {'log': True}),
    )
    hyperparams = {}
    for name, low, high, extra in search_space:
        hyperparams[name] = trial.suggest_int(name, low, high, **extra)

    forest = RandomForestRegressor(**hyperparams, n_jobs=-1)
    return dc.models.SklearnModel(forest)

def random_forest_optuna(trial, kfold):
    """Objective function: mean validation MSE of one sampled forest.

    A single model is built from ``trial`` and then re-fitted on the training
    part of each fold; the error on the matching validation part is recorded.

    Parameters
    ----------
    trial
        Passed straight to ``get_random_forest_model``.
    kfold : iterable
        Items indexable as ``fold[0]`` (training dataset) and ``fold[1]``
        (validation dataset, exposing ``.y``).

    Returns
    -------
    The arithmetic mean of the per-fold mean squared errors.
    """
    model = get_random_forest_model(trial)
    fold_errors = []
    for fold in kfold:
        train_ds, valid_ds = fold[0], fold[1]
        model.fit(train_ds)
        predicted = model.predict(valid_ds)
        measured = valid_ds.y
        fold_errors.append(dc.metrics.mean_squared_error(measured, predicted))

    return sum(fold_errors) / len(fold_errors)

In [3]:
# Load the pickled split DataFrames stored under the 'sma1_random_split' directory.
split_dfs = load_split_dfs('sma1_random_split')

In [4]:
# Circular-fingerprint featurizer configured with radius 2, size 2048 and chirality enabled.
featurizer = dc.feat.CircularFingerprint(radius=2, size=2048, chiral=True)

In [5]:
# Featurize the splits: split_dss holds the per-split datasets after the y-normalization
# transform, all_dss is the merge made before that transform, and transformer is the
# NormalizationTransformer itself.
all_dss, split_dss, transformer = ds_from_df_split(split_dfs, featurizer)

In [21]:
# Outer folds: a list of (merged training dataset, held-out split) pairs.
train_tests = get_kfold_from_ds_split(split_dss)

In [None]:
# Nested cross-validation. For each outer fold, tune the random forest with an
# inner 5-fold CV on the training part, then refit the best configuration on
# the whole training part and score it on the held-out split.
for tt in train_tests:
    # tt[0] is the merged training dataset, tt[1] the held-out split.
    splitter = dc.splits.RandomSplitter()
    kfold = splitter.k_fold_split(dataset=tt[0], k=5)
    # A fresh study per outer fold; the objective is the mean inner-fold MSE.
    study = optuna.create_study(direction='minimize')
    study.optimize(lambda trial: random_forest_optuna(trial, kfold), n_trials=1000)
    print(f'average mse on validation sets in 5-fold CV: {study.best_value}')
    print(f'best_params: {study.best_params}')
    
    # Rebuild the model from the best trial.
    # NOTE(review): this relies on study.best_trial answering suggest_int with
    # its stored parameter values — confirm against the installed Optuna version.
    tuned_rf_model = get_random_forest_model(study.best_trial)
    tuned_rf_model.fit(tt[0]) 
    y_pred = tuned_rf_model.predict(tt[1])
    y_meas = tt[1].y
    # NOTE(review): if train_tests was built from the y-normalized split_dss,
    # this MSE is in normalized-label units rather than raw label units — confirm.
    test_mse = dc.metrics.mean_squared_error(y_meas, y_pred)
    print(f'mse on test set is: {test_mse}\n')