In [None]:
import os

import pandas as pd

from visar.model_training_utils import ST_model_hyperparam_screen, ST_model_training
# Set CUDA_VISIBLE_DEVICES to '0' for this process (device index 0 only).
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

## model setup

In [None]:
# initialize parameters
# Task identifiers, input data file and fingerprint type handed to the
# screening call further down in this notebook.
task_names = ['T107', 'T108']
MT_dat_name = './data/MT_data_clean_June28.csv'
FP_type = 'Circular_2048'

# Candidate values for the hyperparameter screen: every key maps to a list
# of values to try. Only "dropouts" and "layer_sizes" list more than one.
params_dict = {
    "n_tasks": [1],

    "n_features": [2048],  # need modification given FP types
    "activation": ['relu'],
    "momentum": [0.9],
    "batch_size": [128],
    "init": ['glorot_uniform'],
    "learning_rate": [0.01],
    "decay": [1e-6],
    "nb_epoch": [30],
    "dropouts": [0.2, 0.4],
    "nb_layers": [1],
    "batchnorm": [False],
    "layer_sizes": [
        # two-entry layer size candidates
        (1024, 512), (1024, 128), (512, 128), (512, 64), (128, 64), (64, 32),
        # three-entry layer size candidates
        (1024, 512, 128), (512, 128, 64), (128, 64, 32),
    ],
    "penalty": [0.1],
}

In [None]:
# initialize model setup
import random
import time

# Random integer suffix that keeps RUN_KEYs created on the same day distinct.
# In this cell it is only used to label the run; it is not fed to any RNG.
random_seed = random.randint(0, 1000)
local_time = time.localtime(time.time())
log_path = './logs/'
# RUN_KEY has the form ST_<year>_<month>_<day>_<random suffix>.
RUN_KEY = 'ST_%d_%d_%d_%d' % (local_time.tm_year, local_time.tm_mon,
                              local_time.tm_mday, random_seed)
# Create the run directory with os.makedirs rather than shelling out to
# `mkdir`: it does not depend on the shell, also creates ./logs/ when that
# parent is missing, and does not fail if the directory already exists.
os.makedirs(log_path + RUN_KEY, exist_ok=True)
print(RUN_KEY)

## hyperparameter screening

In [None]:
# hyperparam screening using deepchem
# Screens params_dict for the tasks in task_names; the log_path argument
# points at this run's directory, ./logs/<RUN_KEY>.
log_output = ST_model_hyperparam_screen(
    MT_dat_name,
    task_names,
    FP_type,
    params_dict,
    log_path='./logs/' + RUN_KEY,
)

In [None]:
# option1: hyperparameter automatic selection
# Read the screening log from this run's directory: tab-separated, no header
# row, four columns named below.
hyper_param_df = pd.read_csv('./logs/' + RUN_KEY + '/hyperparam_log.txt', header = None, sep = '\t')
hyper_param_df.columns = ['rep_label', 'task_name', 'param', 'r2_score']
hyper_param_df = hyper_param_df.sort_values(by = ['task_name', 'param', 'rep_label'], axis = 0)

# Maps task -> [(layer1, layer2, 1), dropout] for every task that has at
# least one parameter set passing the variance filter.
best_hyperparams = {}
for task in task_names:
    # mean / max / std of r2_score per parameter string, for this task only
    hyper_stat = hyper_param_df.loc[hyper_param_df['task_name'] == task].groupby('param').agg({'r2_score': ['mean','max','std']})
    # filter out ones without reasonable generalization power; a NaN std
    # also compares False against the threshold and is dropped here
    valid_mask = hyper_stat['r2_score']['std'] < 0.15
    hyper_stat = hyper_stat.loc[valid_mask]
    if hyper_stat.shape[0] == 0:
        # Nothing passed the filter: report the task and leave it out of
        # best_hyperparams.
        print(task + ' with training variance too high.')
        continue

    # Rank the surviving parameter sets once by their best r2_score. The
    # index holds parameter strings, so the top value is read with .iloc
    # (positional) rather than [0].
    ranked_max_r2 = hyper_stat['r2_score']['max'].sort_values(ascending=False)
    select_param = ranked_max_r2.index[0]
    select_r2 = ranked_max_r2.iloc[0]

    # NOTE(review): the parsing below assumes the logged parameter string is
    # a ', '-separated list with dropout at position 2 and a two-entry layer
    # size tuple at positions 8-9 -- confirm against the log format. A
    # three-entry layer_sizes candidate would lose its last entry here.
    param_fields = select_param.split(', ')
    tmp_layer1 = int(param_fields[8].split('(')[1])
    tmp_layer2 = int(param_fields[9].split(')')[0])
    tmp_drop = float(param_fields[2])
    best_hyperparams[task] = [(tmp_layer1, tmp_layer2, 1), tmp_drop]
    # Report: task, number of parameter sets that passed the filter, best r2.
    print(task + ': ' + str(hyper_stat.shape[0]) + ', ' + str(select_r2))

In [None]:
# option2: manually pick the training parameters, referring to hyperparam_log saved in RUN_KEY directory
# Each entry has the same layout as the automatic selection above:
# [(layer1, layer2, 1), dropout].
best_hyperparams = {}
best_hyperparams['T107'] = [(512, 64, 1), 0.4]
best_hyperparams['T108'] = [(512, 128, 1), 0.2]

## model training

In [None]:
# Inputs for the training step.
# NOTE(review): RUN_KEY is hard-coded here and replaces any RUN_KEY generated
# earlier in the notebook -- confirm this is the run directory you intend to
# train into.
RUN_KEY = 'ST_2019_8_14_986'
FP_type = 'Circular_2048'
MT_dat_name = './data/MT_data_clean_June28.csv'

In [None]:
# model training
# Train with the per-task settings in best_hyperparams; result_path points at
# this run's directory, ./logs/<RUN_KEY>.
output_df = ST_model_training(
    MT_dat_name,
    FP_type,
    best_hyperparams,
    result_path='./logs/' + RUN_KEY,
)

In [None]:
from visar.VISAR_model_utils import generate_performance_plot_ST
import seaborn as sns
# Build the metrics path from RUN_KEY instead of a hard-coded run name, so
# this cell always reads the metrics of the run selected for training.
plot_df = generate_performance_plot_ST('./logs/' + RUN_KEY + '/performance_metrics.csv')

In [None]:
# Bar-chart grid built from plot_df: one row per 'performance' value, one
# column per 'tt' value, bars grouped by 'task' and coloured by 'method'.
g = sns.catplot(
    data=plot_df,
    kind='bar',
    x='task',
    y='value',
    hue='method',
    row='performance',
    col='tt',
)

## process trained results for VISAR analysis

In [None]:
from visar.VISAR_model_utils import generate_RUNKEY_dataframe_ST
# Settings for post-processing one trained model.
log_path = './logs/'
RUN_KEY = 'ST_2019_8_14_986'

# Files inside the run directory (log_path + RUN_KEY).
prev_model = log_path + RUN_KEY + '/T107_rep2_50.hdf5'
dataset_file = log_path + RUN_KEY + '/temp.csv'

# Task to process, and the prefix handed to the generator as output_prefix.
task_list = ['T107']
output_prefix = 'T107_rep2_50_'

FP_type = 'Circular_2048'
add_features = None

In [None]:
# Run the VISAR post-processing helper on the saved model selected above.
# Arguments are passed positionally in the same order as before; only
# n_layer is given by keyword.
generate_RUNKEY_dataframe_ST(
    prev_model,
    output_prefix,
    task_list,
    dataset_file,
    FP_type,
    add_features,
    n_layer=1,
)

Next:
- copy the output files (including output_compound_df, output_batch_df, output_task_df) to a data directory, and clear the VISAR_webapp static directory if necessary;
- start the app from a command prompt with `bokeh serve --show VISAR_webapp`