In [124]:
import pandas as pd
import optuna
from optuna.samplers import TPESampler

from copy import deepcopy


In [125]:
# --- Configuration -----------------------------------------------------------
# System identifier; used to build the summary file paths below.
protein = '1fme'
# `lag` and `process` are compared against the `lag` / `process` columns of the
# merged summary table further down to pick the rows imported into the study.
lag = 41
process = 3

# One summary file per batch, located in the protein's directory.
summary_paths = [f'../{protein}/summary_batch{batch}.h5' for batch in (1, 2, 3)]

# Hyperparameter definition files, listed in the same order as `summary_paths`.
hp_paths = [
    '../../experiments/hpsample.h5',
    '../../experiments/new_hpsample.h5',
    '../../experiments/new_ts_hpsample_missing_best.h5',
]


In [126]:
# List the tables stored in the first summary file.
# Open read-only: HDFStore defaults to append mode, which needs write access and
# would create an empty file if the path were wrong instead of raising.
with pd.HDFStore(summary_paths[0], mode='r') as store:
    print(store.keys())

['/eigenvalue_ratio', '/eigenvalues', '/timescale_gradient', '/timescale_ratio', '/timescales', '/vamp_eqs', '/vamps']


In [127]:

def _read_flat_table(path, key=None):
    """Read one HDF table and return it with its index moved into columns.

    Parameters
    ----------
    path : str
        Path of the HDF5 file to read.
    key : str, optional
        Name of the table inside the file. ``None`` lets pandas pick the table,
        which is how the hyperparameter files are read.

    Returns
    -------
    pandas.DataFrame
        The table after ``reset_index()``, without the columns 'index' and
        'Group' whenever they are present.
    """
    frame = pd.read_hdf(path, key=key).reset_index()
    # errors='ignore' keeps the original "drop these columns if they exist" rule.
    return frame.drop(columns=['index', 'Group'], errors='ignore')


hp_frames = []
veq_frames = []
gap_frames = []
# NOTE(review): only batch index 0 is read although three paths are configured
# -- confirm this is intentional.
for batch_num in range(1):
    # Hyperparameter definitions
    hp_frames.append(_read_flat_table(hp_paths[batch_num]))
    # Table stored under 'vamp_eqs'
    veq_frames.append(_read_flat_table(summary_paths[batch_num], key='vamp_eqs'))
    # Table stored under 'eigenvalue_ratio' (the "gaps")
    gap_frames.append(_read_flat_table(summary_paths[batch_num], key='eigenvalue_ratio'))

veqs = pd.concat(veq_frames, axis=0)
gaps = pd.concat(gap_frames, axis=0)
hps = pd.concat(hp_frames, axis=0)

print(veqs.shape, hps.shape, gaps.shape)

# Left merges keep every vamp_eqs row; rows without a match in `gaps` get NaN in
# the columns that come from `gaps`.
data = veqs.merge(hps, on=['hp_ix'], how='left')
data = data.merge(gaps, on=['hp_ix', 'lag', 'process'], how='left')

# Readable feature label: 'dihedrals' stays as is, every other feature becomes
# '<distances__transform>-<feature__value>'.
data['feature'] = data.apply(
    lambda row: f"{row['feature__value']}"
    if row['feature__value'] == 'dihedrals'
    else f"{row['distances__transform']}-{row['feature__value']}",
    axis=1,
)
print(data.shape)
data = data.drop_duplicates()
print(data.shape)

data = data.sort_values(by=['hp_ix', 'lag', 'process'])


(62661, 7) (140, 14) (61165, 7)
(62661, 25)
(62661, 25)


In [128]:

# optuna.logging.get_logger("optuna").addHandler(logging.StreamHandler(sys.stdout))
# Destination study: two objectives, both maximised, persisted in a local
# SQLite database named after `db_name`.
study_name = 'k3_ev-gap_vamp_eq'
db_name = "1fme"
storage_name = "sqlite:///{}.db".format(db_name)

# TPE sampler built from the keyword arguments that
# TPESampler.hyperopt_parameters() returns.
tpe_kwargs = TPESampler.hyperopt_parameters()
sampler = TPESampler(**tpe_kwargs)

# load_if_exists=False: the study must not exist yet in the storage, so
# re-running this cell against the same database does not silently reuse a
# study that already holds imported trials.
study = optuna.create_study(
    directions=["maximize", "maximize"],
    study_name=study_name,
    storage=storage_name,
    sampler=sampler,
    load_if_exists=False,
)


[32m[I 2022-09-16 16:09:50,724][0m A new study created in RDB with name: k3_ev-gap_vamp_eq[0m


In [129]:
# Open the existing study whose trials provide the parameter distributions.
# load_study raises if the study is not found, whereas create_study with
# load_if_exists=True would silently create an empty study when the name or
# database is wrong, and the failure would only show up later when indexing
# `old_study.trials`.
old_study = optuna.load_study(
    study_name='1fme-k3-bs',
    storage='sqlite:///1fme-k3-bs-ts_gap-vamp_eq.db',
)

[32m[I 2022-09-16 16:09:52,983][0m Using an existing study with name '1fme-k3-bs' instead of creating a new one.[0m


In [130]:
# Keep only the rows at the configured lag time and process index.
is_selected = data['lag'].eq(lag) & data['process'].eq(process)
data_to_add = data.loc[is_selected, :]
data_to_add.head()

Unnamed: 0,hp_ix,lag,process,median_x,lb_x,ub_x,count_x,cluster__max_iter,cluster__stride,tica__dim,...,dihedrals__which,distances__scheme,distances__transform,distances__steepness,distances__centre,median_y,lb_y,ub_y,count_y,feature
201,0,41,3,2.878183,2.821447,2.938427,100,1000,10,19,...,all,,,0.0,0.0,1.016917,1.001594,1.050878,100.0,dihedrals
699,1,41,3,2.895593,2.83173,2.939413,100,1000,10,4,...,,closest-heavy,logistic,18.519573,0.24964,1.021067,1.003773,1.048565,100.0,logistic-distances
1024,2,41,3,2.724913,2.617577,2.804595,100,1000,10,18,...,,closest-heavy,logistic,38.533821,0.23019,1.023588,1.00349,1.066858,100.0,logistic-distances
1426,3,41,3,2.892029,2.842187,2.950856,100,1000,10,4,...,,ca,logistic,32.429605,0.55326,1.008437,1.002085,1.025764,100.0,logistic-distances
1837,4,41,3,2.839644,2.774408,2.937107,100,1000,10,15,...,all,,,0.0,0.0,1.014646,1.00156,1.056166,100.0,dihedrals


In [131]:
# Smallest steepness and centre among the logistic-distance rows, shown as a
# (steepness, centre) tuple.
logistic_rows = data_to_add.loc[data_to_add['feature'] == 'logistic-distances']
logistic_rows['distances__steepness'].min(), logistic_rows['distances__centre'].min()

(1.6283273776683507, 0.2069380916635079)

In [132]:
# Use one trial of the old study as the template for parameter distributions.
# NOTE(review): index 10 is hard-coded; the trial import loop below looks up
# 'feature', 'transform', 'scheme', 'tica_lag', 'tica_dim' and 'n_clusters' in
# this dict, so the chosen trial must define all of them -- confirm.
template_trial = old_study.trials[10]
dist_by_param = template_trial.distributions

In [133]:
# Replace the search ranges of the two logistic-transform parameters.
# The lower bounds (1.0 and 0.2) lie below the minima found in `data_to_add`
# (about 1.63 for steepness and 0.207 for centre).
dist_by_param['steepness'] = optuna.distributions.FloatDistribution(
    low=1.0, high=50.0, log=False,
)
dist_by_param['centre'] = optuna.distributions.FloatDistribution(
    low=0.2, high=1.5, log=False,
)


In [134]:
def _first_value(frame, column):
    """Return the value stored in the first row of `column` in `frame`."""
    return frame[column].values[0]


# Import every selected hyperparameter set into `study` as a finished trial.
# Grouping by the scalar key 'hp_ix' (not the list ['hp_ix']) avoids the pandas
# FutureWarning about length-1 list groupers; the group key itself is unused.
for _, hp_rows in data_to_add.groupby('hp_ix'):

    # After filtering on lag and process there must be one row per hp_ix.
    assert hp_rows.shape[0] == 1, (
        f"expected exactly one row per hp_ix, found {hp_rows.shape[0]}"
    )

    # (optuna parameter name, source column), kept in insertion order.
    param_columns = [('feature', 'feature__value')]

    if _first_value(hp_rows, 'feature__value') == 'distances':
        # Transform and scheme are only defined for distance features.
        param_columns += [
            ('transform', 'distances__transform'),
            ('scheme', 'distances__scheme'),
        ]
        if _first_value(hp_rows, 'distances__transform') == 'logistic':
            # Centre and steepness only exist for the logistic transform.
            param_columns += [
                ('centre', 'distances__centre'),
                ('steepness', 'distances__steepness'),
            ]

    param_columns += [
        ('tica_lag', 'tica__lag'),
        ('tica_dim', 'tica__dim'),
        ('n_clusters', 'cluster__k'),
    ]

    params = {name: _first_value(hp_rows, column) for name, column in param_columns}
    # Each trial gets its own copy so trials never share distribution objects.
    distributions = {name: deepcopy(dist_by_param[name]) for name in params}

    # Objective order: median_x (from the 'vamp_eqs' table), then median_y
    # (from the 'eigenvalue_ratio' table).
    # NOTE(review): median_y comes from a left merge and can be NaN when the
    # eigenvalue_ratio table has no matching row -- confirm whether such rows
    # should be skipped before they are added as trials.
    study.add_trial(optuna.trial.create_trial(
        params=params,
        distributions=distributions,
        values=[_first_value(hp_rows, 'median_x'), _first_value(hp_rows, 'median_y')],
    ))
    
    
    