In [22]:
import deepchem as dc
from sklearn.ensemble import RandomForestRegressor
import optuna

import pandas as pd
import numpy as np

In [37]:
def load_split_dfs(input_dir, n_splits=5):
    """Load the pickled per-split objects stored under ``input_dir``.

    Files are read from ``{input_dir}/test0`` up to
    ``{input_dir}/test{n_splits - 1}``.

    Args:
        input_dir: Directory that contains the pickled split files.
        n_splits: How many split files to read. Defaults to 5, the value
            that used to be hard-coded.

    Returns:
        list: The unpickled objects in split-index order (presumably
        DataFrames with ``smiles`` and ``label`` columns -- confirm against
        the code that wrote the pickles).

    Note:
        ``pd.read_pickle`` can execute arbitrary code while unpickling; only
        point this at files that come from a trusted source.
    """
    return [pd.read_pickle(f'{input_dir}/test{i}') for i in range(n_splits)]

def ds_from_df_split(split_dfs, featurizer):
    """Featurize each split and build raw-merged plus y-normalized datasets.

    Every entry of ``split_dfs`` is used (the count is no longer fixed at 5).

    Args:
        split_dfs: Sequence of objects exposing ``.smiles`` and ``.label``
            (presumably pandas DataFrames -- confirm against the loader).
            ``label`` values are stacked row-wise with ``np.vstack``.
        featurizer: Object with a ``featurize(smiles)`` method whose result
            is used as the feature matrix ``X``.

    Returns:
        tuple: ``(all_dss, split_dss, transformer)`` where

        * ``all_dss`` is the merge of the per-split datasets, built before
          the normalization transform is applied and returned as-is,
        * ``split_dss`` is the list of per-split datasets after
          ``transformer.transform``,
        * ``transformer`` is the ``NormalizationTransformer`` created with
          ``transform_y=True`` on ``all_dss``.

    Raises:
        ValueError: If ``split_dfs`` is empty.
    """
    split_dfs = list(split_dfs)
    if not split_dfs:
        raise ValueError('split_dfs must contain at least one split')

    raw_splits = []
    for df in split_dfs:
        X = featurizer.featurize(df.smiles)
        y = np.vstack(df.label.to_numpy())
        raw_splits.append(dc.data.DiskDataset.from_numpy(X=X, y=y, ids=df.smiles))

    all_dss = dc.data.DiskDataset.merge(raw_splits)

    # NOTE(review): the transformer is created from the merged dataset, i.e.
    # from every split including whichever one is later held out. Confirm
    # this is acceptable for the intended cross-validation.
    transformer = dc.trans.NormalizationTransformer(transform_y=True, dataset=all_dss)
    split_dss = [transformer.transform(ds) for ds in raw_splits]

    return all_dss, split_dss, transformer

def get_kfold_from_ds_split(split_dss):
    """Build leave-one-split-out ``(train, test)`` pairs.

    For each position ``i`` the ``i``-th dataset is held out as the test set
    and the remaining datasets are merged into a single training dataset.

    Args:
        split_dss: Sequence of per-split datasets accepted by
            ``dc.data.DiskDataset.merge``. The sequence is not modified.

    Returns:
        list: One ``(train_ds, test_ds)`` tuple per split, in split order.

    Raises:
        ValueError: If fewer than two splits are given, since there would be
            nothing left to train on.
    """
    if len(split_dss) < 2:
        raise ValueError('need at least two splits to build train/test folds')

    kfold = []
    for i in range(len(split_dss)):
        # Work on a copy so popping the held-out split leaves the input intact.
        train_parts = list(split_dss)
        test_ds = train_parts.pop(i)
        train_ds = dc.data.DiskDataset.merge(train_parts)
        kfold.append((train_ds, test_ds))
    return kfold

def get_random_forest_model(trial):
    """Create a DeepChem-wrapped random forest from trial-suggested settings.

    Five integer hyperparameters are requested from ``trial`` via
    ``suggest_int`` (in the order n_estimators, max_depth, min_samples_split,
    min_samples_leaf, max_leaf_nodes) and passed, together with
    ``n_jobs=-1``, to ``RandomForestRegressor``.

    Args:
        trial: Object providing ``suggest_int(name, low, high, log=...)``.

    Returns:
        The ``dc.models.SklearnModel`` wrapping the configured regressor.
    """
    n_estimators = trial.suggest_int('n_estimators', 1, 500, log=True)
    max_depth = trial.suggest_int('max_depth', 1, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 1000, log=True)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 1000, log=True)
    max_leaf_nodes = trial.suggest_int('max_leaf_nodes', 2, 100, log=True)

    forest = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
        n_jobs=-1,
    )
    return dc.models.SklearnModel(forest)

def random_forest_optuna(trial, kfold):
    """Objective function: mean MSE of a random forest across folds.

    A model is built from ``trial`` via ``get_random_forest_model``; for each
    fold it is fitted on ``fold[0]`` and scored on ``fold[1]`` with
    ``dc.metrics.mean_squared_error``.

    Args:
        trial: Forwarded unchanged to ``get_random_forest_model``.
        kfold: Iterable of folds, where ``fold[0]`` is the training dataset
            and ``fold[1]`` is the evaluation dataset exposing ``.y``.

    Returns:
        The arithmetic mean of the per-fold mean squared errors.

    Raises:
        ValueError: If ``kfold`` yields no folds.
    """
    rf_model = get_random_forest_model(trial)
    fold_mse = []
    for fold in kfold:
        train_ds, test_ds = fold[0], fold[1]
        # The original referenced an undefined name `rf` here, which raised
        # NameError on the first fold; the model built above is `rf_model`.
        rf_model.fit(train_ds)
        y_pred = rf_model.predict(test_ds)
        y_meas = test_ds.y
        fold_mse.append(dc.metrics.mean_squared_error(y_meas, y_pred))

    if not fold_mse:
        raise ValueError('kfold must contain at least one (train, test) fold')

    return sum(fold_mse) / len(fold_mse)

In [38]:
# Read the pickled split files test0..test4 from the 'sma1_random_split' directory.
split_dfs = load_split_dfs('sma1_random_split')

In [7]:
# Featurizer passed to ds_from_df_split to turn each split's SMILES column into X.
# NOTE(review): this cell's execution count (7) is out of sequence with the
# neighbouring cells (38, 40) -- confirm the notebook runs top to bottom on a
# fresh kernel.
featurizer = dc.feat.CircularFingerprint(radius=2, size=2048, chiral=True)

In [40]:
# Featurize every split; returns the merged dataset, the per-split datasets
# after y-normalization, and the transformer that performed the normalization.
all_dss, split_dss, transformer = ds_from_df_split(split_dfs, featurizer)

In [None]:
# Leave-one-split-out folds. Each entry is (merged training dataset, held-out
# dataset), the shape random_forest_optuna reads as fold[0] / fold[1].
# The previous loop body was the bare expression `kfold`, so the list stayed empty.
kfold = []
for i in range(len(split_dss)):
    train_parts = split_dss.copy()  # copy so pop() leaves split_dss untouched
    held_out_ds = train_parts.pop(i)
    kfold.append((dc.data.DiskDataset.merge(train_parts), held_out_ds))

In [46]:
# Prototype of the leave-one-out fold layout using plain integers as stand-ins:
# each tuple is (everything except the held-out item, the held-out item).
inp = [0, 1, 2, 3, 4]
test = []

for held_out_pos, held_out in enumerate(inp):
    remaining = inp[:held_out_pos] + inp[held_out_pos + 1:]
    test.append((remaining, held_out))

print(test)

[([1, 2, 3, 4], 0), ([0, 2, 3, 4], 1), ([0, 1, 3, 4], 2), ([0, 1, 2, 4], 3), ([0, 1, 2, 3], 4)]
