In [1]:
import math
from copy import deepcopy

import optuna
import pandas as pd
from optuna.samplers import TPESampler

  from .autonotebook import tqdm as notebook_tqdm


In [158]:
# --- Run configuration ------------------------------------------------------
# Name of the protein directory the inputs are read from (alternative: 'CLN').
protein = 'BBA'

# Only rows with this lag and this process index are turned into trials
# further down (alternative lag: 31).
lag = 41
process = 2

# hp_ix values that can optionally be excluded when selecting rows
# (alternative: [74, 24, 6]).
best_hp_ix = [24]

# One summary file and one hyperparameter file per batch.
# Alternatives: [f'../{protein}/summary.h5'] and ['../../../hpsample_stride1.h5'].
summary_paths = [f'../{protein}/summary_batch1.h5']
hp_paths = [f'../{protein}/hpsample.h5']

# Optuna storage: an SQLite database named after the protein, next to its data.
db_name = protein
storage_name = f"sqlite:///../{protein}/{db_name}.db"
study_name = 'gap-ev2ev3_vampeq2'

# When True, the sampler cell replaces the default TPE gamma function.
new_gamma = False

In [159]:
# List the tables stored in the first configured summary file.
# The path comes from `summary_paths` so this check follows the `protein`
# setting, and the store is opened read-only so a wrong path raises instead
# of creating an empty file (HDFStore's default mode is append).
with pd.HDFStore(summary_paths[0], mode='r') as store:
    print(store.keys())

['/evs', '/timescale_gradient', '/timescale_ratio', '/timescales', '/vampeq2', '/vamps']


---
### Create a new study 

In [165]:
def _read_table(path, key=None, suffix=None):
    """Read one table from an HDF5 file and normalise its columns.

    The index is moved into ordinary columns, the columns 'index' and 'Group'
    are dropped when present, and, if `suffix` is given, the statistic columns
    'median', 'lb', 'ub' and 'count' are renamed to '<name>_<suffix>' so that
    several summary tables can be merged without clashing column names.

    Parameters
    ----------
    path : str
        Location of the HDF5 file.
    key : str, optional
        Table to read; None lets pandas pick the file's only table.
    suffix : str, optional
        Suffix appended to the statistic columns; None leaves them untouched.

    Returns
    -------
    pandas.DataFrame
    """
    table = pd.read_hdf(path, key=key).reset_index()
    table = table.drop(columns=[col for col in ('index', 'Group') if col in table.columns])
    if suffix is not None:
        table = table.rename(
            columns={stat: f'{stat}_{suffix}' for stat in ('median', 'lb', 'ub', 'count')}
        )
    return table


def _feature_label(row):
    """Build the combined feature label for one merged row."""
    if row['feature__value'] == 'dihedrals':
        return f"{row['feature__value']}"
    return f"{row['distances__transform']}-{row['feature__value']}"


hps = []
ts = []
gaps = []
vampeq2 = []

# NOTE(review): only the first configured batch is read (range(1)); extending
# this to more batches presumably requires hp_ix to be unique across batches.
for batch_num in range(1):
    # Hyperparameter definitions
    hps.append(_read_table(hp_paths[batch_num]))

    # Summary statistics, one table per quantity
    ts.append(_read_table(summary_paths[batch_num], key='timescales', suffix='ts'))
    gaps.append(_read_table(summary_paths[batch_num], key='timescale_ratio', suffix='gap'))
    vampeq2.append(_read_table(summary_paths[batch_num], key='vampeq2', suffix='vampeq2'))

hps = pd.concat(hps, axis=0)
ts = pd.concat(ts, axis=0)
gaps = pd.concat(gaps, axis=0)
vampeq2 = pd.concat(vampeq2, axis=0)

print(ts.shape, hps.shape, gaps.shape, vampeq2.shape)

# Left-join everything onto the hyperparameter table: timescales bring in the
# 'lag' and 'process' columns, which the remaining tables are then matched on.
data = (
    hps.merge(ts, on=['hp_ix'], how='left')
       .merge(gaps, on=['hp_ix', 'lag', 'process'], how='left')
       .merge(vampeq2, on=['hp_ix', 'lag', 'process'], how='left')
)

data['feature'] = data.apply(_feature_label, axis=1)
data = data.drop_duplicates()
print('\n', data.shape)
data = data.sort_values(by=['hp_ix', 'lag', 'process'])

(68711, 7) (140, 14) (68711, 7) (1617, 7)

 (62665, 29)


In [168]:
# To mirror Optuna's log on stdout (needs `logging` and `sys` imported):
# optuna.logging.get_logger("optuna").addHandler(logging.StreamHandler(sys.stdout))

def hyperopt_gamma(x: int) -> int:
    """Alternative gamma function for the TPE sampler.

    Returns ceil(0.25 * sqrt(x)), capped at 13. Per Optuna's TPESampler
    documentation, gamma receives the number of finished trials and returns
    how many of them are used for the "good" density.
    """
    return min(int(math.ceil(0.25 * math.sqrt(x))), 13)


# Start from the hyperopt-compatible defaults and optionally swap in the
# custom gamma. The customised dict itself must be handed to the sampler,
# otherwise the `new_gamma` switch has no effect.
hyperopt_parameters = TPESampler.hyperopt_parameters()
if new_gamma:
    hyperopt_parameters['gamma'] = hyperopt_gamma

sampler = TPESampler(**hyperopt_parameters, multivariate=True)

# Two objectives, both maximised. An existing study with the same name in the
# storage is reused rather than recreated.
study = optuna.create_study(study_name=study_name,
                            sampler=sampler,
                            storage=storage_name,
                            directions=["maximize", "maximize"],
                            load_if_exists=True)

[32m[I 2023-03-10 13:41:31,663][0m Using an existing study with name 'gap-ev2ev3_vampeq2' instead of creating a new one.[0m


In [169]:
# Restrict the merged table to the configured lag and process.
lag_matches = data['lag'] == lag
process_matches = data['process'] == process
data_to_add = data.loc[lag_matches & process_matches, :]

# To leave out the best trials, additionally AND the mask with
# ~data.hp_ix.isin(best_hp_ix)

data_to_add.shape

(136, 29)

In [171]:
# Search-space definition: one Optuna distribution per hyperparameter name.
# Trials copy the entries they need from this table.
dist_by_param = {
    # Continuous parameters
    'steepness': optuna.distributions.FloatDistribution(low=1.0, high=50.0, log=False),
    'centre': optuna.distributions.FloatDistribution(low=0.2, high=1.5, log=False),

    # Categorical choices
    'feature': optuna.distributions.CategoricalDistribution(['dihedrals', 'distances']),
    'transform': optuna.distributions.CategoricalDistribution(['logistic', 'linear']),
    'scheme': optuna.distributions.CategoricalDistribution(['ca', 'closest-heavy']),

    # Integer parameters
    'tica_lag': optuna.distributions.IntDistribution(low=1, high=100, log=False),
    'tica_dim': optuna.distributions.IntDistribution(low=1, high=20, log=False),
    'n_clusters': optuna.distributions.IntDistribution(low=10, high=500, log=False),
}

In [173]:
# Register every selected hyperparameter set as a trial of the study.
# NOTE(review): the study lives in persistent storage, so running this cell
# again appends the same trials a second time.
for _, hp_rows in data_to_add.groupby(['hp_ix']):

    # Exactly one row is expected per hp_ix after filtering.
    assert hp_rows.shape[0] == 1

    def first(column):
        """Value of `column` in the group's single row."""
        return hp_rows[column].values[0]

    # Trial parameter name -> column of the merged table holding its value.
    # Conditional parameters are only included when they apply.
    column_by_param = {'feature': 'feature__value'}
    if first('feature__value') == 'distances':
        column_by_param['transform'] = 'distances__transform'
        column_by_param['scheme'] = 'distances__scheme'
        if first('distances__transform') == 'logistic':
            column_by_param['centre'] = 'distances__centre'
            column_by_param['steepness'] = 'distances__steepness'
    column_by_param['tica_lag'] = 'tica__lag'
    column_by_param['tica_dim'] = 'tica__dim'
    column_by_param['n_clusters'] = 'cluster__k'

    params = {name: first(column) for name, column in column_by_param.items()}
    distributions = {name: deepcopy(dist_by_param[name]) for name in column_by_param}

    # Objective values, in the order of the study's directions:
    # median of the gap table, then median of the vampeq2 table.
    objective_values = [first('median_gap'), first('median_vampeq2')]

    study.add_trial(optuna.trial.create_trial(
        params=params,
        distributions=distributions,
        values=objective_values,
    ))

In [174]:
# Spot check: show the distributions recorded for one stored trial (index 10).
example_trial = study.trials[10]
example_trial.distributions

{'feature': CategoricalDistribution(choices=('dihedrals', 'distances')),
 'tica_lag': IntDistribution(high=100, log=False, low=1, step=1),
 'tica_dim': IntDistribution(high=20, log=False, low=1, step=1),
 'n_clusters': IntDistribution(high=500, log=False, low=10, step=1)}