In [1]:
import deepchem as dc
import numpy as np
import optuna
import pandas as pd
import tensorflow as tf
from sklearn.ensemble import RandomForestRegressor
from tensorflow.keras import layers
from tensorflow.keras import regularizers

optuna.logging.set_verbosity(optuna.logging.CRITICAL)

In [2]:
def load_split_dfs(input_dir, n_splits=5):
    """Load the pre-computed data splits stored as pickled objects.

    Reads the files ``test0`` ... ``test{n_splits - 1}`` from ``input_dir``.

    Args:
        input_dir: Directory (str or path-like) that contains the pickle files.
        n_splits: Number of split files to read. Defaults to 5, which matches
            the previously hard-coded behaviour.

    Returns:
        list: The objects returned by ``pd.read_pickle``, ordered by split index.

    Raises:
        FileNotFoundError: Propagated from pandas if an expected file is missing.
    """
    # SECURITY: unpickling can execute arbitrary code -- only point this at
    # files you produced yourself or otherwise trust.
    return [pd.read_pickle(f'{input_dir}/test{i}') for i in range(n_splits)]

def ds_from_df_split(split_dfs, featurizer):
    """Featurize each split DataFrame and normalize the labels.

    Works for any number of splits (the original version only handled
    exactly five and silently ignored any extra ones).

    Args:
        split_dfs: Iterable of DataFrames, each exposing a ``smiles`` and a
            ``label`` column.
        featurizer: Object with a ``featurize(smiles)`` method (e.g. a DeepChem
            featurizer) that produces the feature matrix for a split.

    Returns:
        tuple: ``(all_dss, split_dss, transformer)`` where

        * ``all_dss`` is the merge of the per-split datasets built *before*
          the transformer is applied,
        * ``split_dss`` is the list of per-split datasets after
          ``transformer.transform`` has been applied to each,
        * ``transformer`` is the ``NormalizationTransformer`` (``transform_y=True``)
          that was constructed from ``all_dss``.
    """
    split_dss = []
    for df in split_dfs:
        X = featurizer.featurize(df.smiles)
        # np.vstack turns the per-row label entries into a 2-D array.
        y = np.vstack(df.label.to_numpy())
        split_dss.append(dc.data.DiskDataset.from_numpy(X=X, y=y, ids=df.smiles))
    all_dss = dc.data.DiskDataset.merge(split_dss)

    # NOTE(review): the normalization statistics come from the merged dataset,
    # i.e. from every split at once, including whichever one is later held out.
    transformer = dc.trans.NormalizationTransformer(transform_y=True, dataset=all_dss)
    split_dss = [transformer.transform(ds) for ds in split_dss]

    return all_dss, split_dss, transformer

def get_kfold_from_ds_split(split_dss):
    """Build leave-one-split-out (train, test) pairs from pre-made splits.

    For every split ``i`` the remaining splits are merged into one training
    dataset and split ``i`` is used as the held-out dataset. The number of
    folds equals ``len(split_dss)``; the original version always produced five
    folds, so with more than five splits the extra ones were never held out
    even though they were merged into every training set.

    Args:
        split_dss: Sequence of DeepChem datasets, one per split.

    Returns:
        list: ``(merged_training_dataset, held_out_dataset)`` tuples, one per
        split and in the same order as ``split_dss``.
    """
    splits = list(split_dss)  # work on a copy; the caller's sequence is untouched
    kfold = []
    for i, held_out in enumerate(splits):
        training_parts = splits[:i] + splits[i + 1:]
        kfold.append((dc.data.DiskDataset.merge(training_parts), held_out))
    return kfold

In [66]:
def random_forest_model_from_trial(trial):
    """Create a DeepChem-wrapped random forest with trial-suggested settings.

    Five integer hyperparameters are requested from ``trial`` (all but
    ``max_depth`` on a log scale) and forwarded to ``RandomForestRegressor``
    together with ``n_jobs=-1``.

    Args:
        trial: Object exposing ``suggest_int`` (e.g. an Optuna trial).

    Returns:
        ``dc.models.SklearnModel`` wrapping the configured regressor.
    """
    hyperparams = {
        'n_estimators': trial.suggest_int('n_estimators', 1, 500, log=True),
        'max_depth': trial.suggest_int('max_depth', 1, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 1000, log=True),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 1000, log=True),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 2, 100, log=True),
    }
    forest = RandomForestRegressor(n_jobs=-1, **hyperparams)
    return dc.models.SklearnModel(forest)

def random_forest_model_from_param(param):
    """Create a DeepChem-wrapped random forest from an explicit parameter dict.

    Args:
        param: Mapping of keyword arguments forwarded to
            ``RandomForestRegressor`` (``n_jobs=-1`` is always added).

    Returns:
        ``dc.models.SklearnModel`` wrapping the configured regressor.
    """
    forest = RandomForestRegressor(n_jobs=-1, **param)
    return dc.models.SklearnModel(forest)

def random_forest_optuna(trial, kfold):
    """Optuna objective: mean validation MSE of a random forest over the folds.

    A single model is built from ``trial`` and then fitted once per fold on a
    shuffled copy of that fold's training dataset.

    Args:
        trial: Trial used to pick the random forest hyperparameters.
        kfold: Iterable of ``(train_dataset, validation_dataset)`` pairs.

    Returns:
        The arithmetic mean of the per-fold mean squared errors.
    """
    forest_model = random_forest_model_from_trial(trial)
    fold_errors = []
    for fold in kfold:
        train_ds, valid_ds = fold[0], fold[1]
        forest_model.fit(train_ds.complete_shuffle())
        predictions = forest_model.predict(valid_ds)
        fold_errors.append(dc.metrics.mean_squared_error(valid_ds.y, predictions))

    return sum(fold_errors) / len(fold_errors)

In [49]:
def feedforward_neural_network_from_trial(trial, len_feat_vec):
    """Build a feed-forward regression network from trial-suggested settings.

    The trial is asked, in this order, for: the number of hidden layers (1-3),
    the L2 weight decay, a width and a dropout rate for every hidden layer,
    and finally the learning rate. Each hidden layer is a ReLU ``Dense`` layer
    followed by ``Dropout``; the output is a single linear unit. All ``Dense``
    kernels carry the same L2 penalty.

    Args:
        trial: Object exposing ``suggest_int`` / ``suggest_float``
            (e.g. an Optuna trial).
        len_feat_vec: Length of the input feature vector.

    Returns:
        ``dc.models.KerasModel`` wrapping the network, using ``L2Loss`` and
        the suggested learning rate.
    """
    n_hidden = trial.suggest_int('n_layers', 1, 3)
    l2_strength = trial.suggest_float('weight_decay', 1e-10, 1e-3, log=True)

    def l2_penalty():
        # A fresh regularizer instance per layer, as in a hand-written stack.
        return regularizers.l2(l2_strength)

    network = tf.keras.Sequential()
    network.add(layers.Input(shape=(len_feat_vec,)))
    # Dropout directly on the input is currently disabled:
    # dropout = trial.suggest_float(f'dropout_{0}', 0.0, 1.0)
    # model.add(tf.keras.layers.Dropout(dropout))
    for i in range(n_hidden):
        width = trial.suggest_int(f'n_nodes_{i+1}', 4, 128, log=True)
        drop_rate = trial.suggest_float(f'dropout_{i+1}', 0.0, 1.0)
        hidden = layers.Dense(width, activation='relu', kernel_regularizer=l2_penalty())
        network.add(hidden)
        network.add(layers.Dropout(drop_rate))
    network.add(layers.Dense(1, kernel_regularizer=l2_penalty()))

    step_size = trial.suggest_float('learning_rate', 1e-8, 1e-1, log=True)
    return dc.models.KerasModel(
        model=network,
        loss=dc.models.losses.L2Loss(),
        learning_rate=step_size,
    )

def feedforward_neural_network_from_param(param):
    """Placeholder: not implemented yet.

    Args:
        param: Currently ignored.

    Returns:
        Always ``None``.
    """
    return None

def feedforward_neural_network_optuna(trial, kfold, transformer, len_feat_vec):
    """Optuna objective: mean best validation MSE of the network over the folds.

    For every fold a new network is built from ``trial``, trained for 100
    epochs on a shuffled copy of the fold's training dataset, and monitored
    with a ``ValidationCallback`` on the fold's validation dataset
    (``interval=100``, checkpoints written to ``'tmp'``).

    Args:
        trial: Trial used to pick the network hyperparameters; the same trial
            is reused for every fold.
        kfold: Iterable of ``(train_dataset, validation_dataset)`` pairs.
        transformer: Accepted for interface compatibility but not used here;
            the callback is created without transformers.
        len_feat_vec: Length of the input feature vector.

    Returns:
        The arithmetic mean over folds of the callback's ``_best_score``.
    """
    fold_scores = []

    for fold in kfold:
        train_ds, valid_ds = fold[0], fold[1]
        candidate = feedforward_neural_network_from_trial(trial, len_feat_vec)
        mse_metric = dc.metrics.Metric(dc.metrics.mean_squared_error)
        # Variant that would un-transform predictions before scoring (disabled):
        # callback = dc.models.callbacks.ValidationCallback(dataset=k[1], interval=10, metrics=[metric], transformers=[transformer])
        monitor = dc.models.callbacks.ValidationCallback(
            dataset=valid_ds,
            interval=100,
            metrics=[mse_metric],
            save_dir='tmp',
        )
        candidate.fit(dataset=train_ds.complete_shuffle(), nb_epoch=100, callbacks=monitor)
        # NOTE(review): _best_score is a private attribute of the callback --
        # presumably the lowest monitored MSE seen so far; confirm against DeepChem.
        fold_scores.append(monitor._best_score)

    return sum(fold_scores) / len(fold_scores)

In [57]:
def get_feedforward_NN_model(trial):
    """Placeholder: not implemented yet.

    Args:
        trial: Currently ignored.

    Returns:
        Always ``None``.
    """
    return None

def feedforward_NN_optuna(trial):
    """Placeholder: not implemented yet.

    Args:
        trial: Currently ignored.

    Returns:
        Always ``None``.
    """
    return None

In [7]:
# Read the pickled per-split DataFrames from the 'sma1_random_split' directory
# (path is relative to the notebook's working directory).
split_dfs = load_split_dfs('sma1_random_split')

In [51]:
# Size of the fingerprint vector; the same constant is later passed to the
# network builder as the input width, so the two always agree.
LEN_FEAT_VEC = 2048
# Circular fingerprint featurizer: radius 2, chirality enabled, LEN_FEAT_VEC entries.
featurizer = dc.feat.CircularFingerprint(radius=2, size=LEN_FEAT_VEC, chiral=True)

In [9]:
# Featurize every split and normalize the labels. Yields the merged dataset,
# the list of per-split datasets, and the fitted label transformer.
all_dss, split_dss, transformer = ds_from_df_split(split_dfs, featurizer)

In [10]:
# Outer cross-validation folds: each entry is (merged training dataset, held-out test dataset).
train_tests = get_kfold_from_ds_split(split_dss)

In [None]:
# Nested cross-validation driver.
# Outer loop: one iteration per (train, test) pair in train_tests.
# Inner step: hyperparameters are tuned with Optuna on a 5-fold split of the
# outer training set; the best configuration is then retrained and scored on
# the outer test set. One summary tuple per outer fold is stored in output_info.
output_info = []

for i,tt in enumerate(train_tests):
    # Inner 5-fold split of this outer fold's training data, used only for tuning.
    splitter = dc.splits.RandomSplitter()
    kfold = splitter.k_fold_split(dataset=tt[0], k=5)
    study = optuna.create_study(direction='minimize')
    # n_trials=1: only a single hyperparameter configuration is evaluated per outer fold.
    study.optimize(lambda trial: feedforward_neural_network_optuna(trial, kfold, transformer, LEN_FEAT_VEC), n_trials=1)
    
    # Retrain five times with the best trial's hyperparameters on the full
    # outer training set to average over run-to-run variation.
    test_mse = []
    for j in range(5):
        tuned_fnn_model = feedforward_neural_network_from_trial(study.best_trial, LEN_FEAT_VEC)
        metric = dc.metrics.Metric(dc.metrics.mean_squared_error)
        # callback = dc.models.callbacks.ValidationCallback(dataset=k[1], interval=10, metrics=[metric], transformers=[transformer])
        # NOTE(review): the callback monitors the outer *test* set (tt[1]) and the
        # recorded value is its private _best_score, so the test data also selects
        # the checkpoint -- presumably an optimistic estimate; confirm this is intended.
        # NOTE(review): no transformers are passed, so the MSE is presumably in
        # normalized label units -- confirm.
        callback = dc.models.callbacks.ValidationCallback(dataset=tt[1], interval=100, metrics=[metric], save_dir='tmp')
        tuned_fnn_model.fit(dataset=tt[0].complete_shuffle(), nb_epoch=100, callbacks=callback)
        test_mse.append(callback._best_score)
    
    # (outer fold index, best inner-CV score, best params as text, mean test score, individual test scores)
    output_info.append((i, study.best_value, str(study.best_params), sum(test_mse)/len(test_mse), test_mse))

In [54]:
# Collect the per-fold summaries into a table: one row per outer fold.
# NOTE(review): the saved output of this cell shows split_index == 4 on every
# row, which the current loop (index taken from enumerate) would not produce --
# the output looks stale; re-run after a kernel restart to confirm.
out_df = pd.DataFrame(output_info, columns=['split_index', 'avg_valid_mse', 'best_params', 'avg_test_mse', 'test_mses'])
out_df

Unnamed: 0,split_index,avg_valid_mse,best_params,avg_test_mse,test_mses
0,4,1.019483,"{'n_layers': 3, 'weight_decay': 3.772144576138...",0.842647,"[0.8296379533441094, 0.8381851213995676, 0.851..."
1,4,0.902524,"{'n_layers': 2, 'weight_decay': 3.235418746670...",0.97507,"[0.9859052387413431, 0.9738764414742868, 0.951..."
2,4,0.957456,"{'n_layers': 1, 'weight_decay': 1.299267424835...",0.851133,"[0.8313375567924828, 0.8662653812634217, 0.859..."
3,4,0.998971,"{'n_layers': 1, 'weight_decay': 1.131269472048...",1.168706,"[1.171167094846065, 1.1825770070344988, 1.1512..."
4,4,0.993971,"{'n_layers': 3, 'weight_decay': 2.151749665807...",1.055636,"[1.049748495713449, 1.0418304834274834, 1.0517..."
