In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pathlib import Path
import seaborn as sns
import functions as funcs

# Introduction

This notebook will select models using two different methods.  

**Method 1**

For each feature, select the model with the best VAMP score at the chosen lag time and a fixed number of dominant processes. 

**Method 2**

For each feature, select the model with the best VAMP score among those whose timescale gap exceeds a threshold. 



In [2]:
# Score and timescale-ratio summary tables
# (the `_bs_` in the file names presumably stands for "bootstrap" -- TODO confirm).
vamps = pd.read_hdf('./summaries/vamps_bs_summary.h5')
vampe = pd.read_hdf('./summaries/vampe_bs_summary.h5')
ratios = pd.read_hdf('./summaries/ts_ratio_bs_summary.h5')

# Hyperparameter table; joined to the score tables on (protein, hp_index) further down.
hps = pd.read_hdf('./summaries/hp_summary.h5')

# Per-protein choices, converted into {protein: value} lookups.
chosen_lags = pd.read_hdf('chosen_lag_times.h5')
chosen_ks = pd.read_hdf('chosen_num_dominant.h5')
lags = {
    protein: lag
    for protein, lag in zip(chosen_lags['protein'], chosen_lags['lag'])
}
num_its = {
    protein: k
    for protein, k in zip(chosen_ks['protein'], chosen_ks['num_its'])
}

In [3]:
# Preview the first rows of the timescale-ratio summary table.
ratios.head()

Unnamed: 0,protein,hp_index,lag,num_its,median,lb_diff,ub_diff
0,BBA,0.0,10.0,2.0,1.705263,0.660876,1.31553
1,BBA,0.0,10.0,3.0,1.408334,0.372168,1.497887
2,BBA,0.0,10.0,4.0,1.170877,0.143348,0.827098
3,BBA,0.0,10.0,5.0,1.175784,0.138526,0.475747
4,BBA,0.0,10.0,6.0,1.162832,0.148188,0.554509


In [4]:
def select_rows_by_dict(df, select_dict, selector):
    """Keep the rows whose `selector` value equals the value chosen for their protein.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'protein' column and a column named `selector`.
    select_dict : mapping
        Maps each protein label to the value of `selector` to keep for it.
    selector : str
        Name of the column compared against the per-protein value.

    Returns
    -------
    pandas.DataFrame
        A copy of the matching rows, with the original index labels and columns.
        An empty `df` yields an empty frame with the same columns.

    Raises
    ------
    KeyError
        If `df` contains a protein that has no entry in `select_dict`.
    """
    proteins = df['protein']

    # Fail loudly (as a plain dict lookup would) when a protein has no chosen value.
    known = proteins.isin(list(select_dict.keys()))
    if not known.all():
        missing = list(proteins[~known].unique())
        raise KeyError(f"no entry in select_dict for protein(s): {missing}")

    # Vectorised lookup: avoids a Python-level loop over rows and, unlike
    # row-wise `apply`, still produces a boolean Series for an empty frame.
    target = proteins.map(select_dict)
    ix = df[selector] == target
    return df.loc[ix, :].copy()


# For every protein, keep only the rows computed at that protein's chosen lag time.
vamps, ratios, vampe = (
    select_rows_by_dict(table, lags, 'lag') for table in (vamps, ratios, vampe)
)

# Method 1

## Choose hyperparameters
We'll select the best models per feature based on the selected values of $k$ (`num_its`) and lag time. Also include the worst-performing model per protein (this step is currently commented out in the cell below).

In [5]:
# VAMP scores at each protein's chosen number of processes, labelled with the feature
# of the corresponding hyperparameter set.
df = (
    select_rows_by_dict(vamps, num_its, 'num_its')
    .merge(hps[['protein', 'hp_index', 'feature']], on=['protein', 'hp_index'])
)

# Rank by median VAMP score; rank 1 is the highest median. One ranking is computed within
# each (protein, feature) group and one across all models of a protein.
df = df.assign(
    feature_vamp_rank=df.groupby(['protein', 'feature'])['median'].rank(ascending=False),
    overall_vamp_rank=df.groupby(['protein'])['median'].rank(ascending=False),
)

# Keep the top model per (protein, feature). 'feature' is dropped before joining the full
# hyperparameter table, which brings it back together with every other hyperparameter column.
# NOTE: an earlier version also appended the worst overall-ranked model per protein;
# that step is disabled, and the table is not written to disk in this cell.
m1_selection = (
    df[df['feature_vamp_rank'] == 1.0]
    .drop(columns=['feature'])
    .merge(hps, on=['protein', 'hp_index'], how='left')
    .sort_values(by=['protein', 'overall_vamp_rank'])
)



In [6]:
# old_selection = pd.read_hdf('best_hps_per_feature.h5')

# pd.concat([m1_selection.loc[:, ['overall_vamp_rank', 'hp_index', 'protein']], old_selection.loc[:, ['hp_rank', 'hp_index', 'protein']]], axis=1)

# Choose new num_its

In [7]:
# For every (protein, hp_index), find the largest median timescale ratio and recover the
# full `ratios` row(s) that attain it (matched with a 1e-9 tolerance on the median).
max_ratio = (
    ratios
    .groupby(['protein', 'hp_index'], as_index=False)['median']
    .max()
    .merge(ratios, on=['protein', 'hp_index'], suffixes=('_ratio_max', '_ratio_value'))
    .loc[lambda d: (d['median_ratio_max'] - d['median_ratio_value']).abs() < 1e-9, :]
)

# Attach the max-ratio information to the Method 1 selection and give the clashing
# `_x` (VAMP side) / `_y` (ratio side) columns descriptive names. `lag_y` is not renamed.
# NOTE: `m1_selection` is rebuilt from itself, so running this cell twice in one session
# merges the ratio columns a second time; re-run the selection cell first.
column_names = {
    'median': 'median_vamp',
    'lb_diff_x': 'lb_diff_vamp',
    'ub_diff_x': 'ub_diff_vamp',
    'lag_x': 'chosen_lag',
    'num_its_x': 'chosen_num_its',
    'num_its_y': 'new_num_its',
    'lb_diff_y': 'lb_diff_ratio',
    'ub_diff_y': 'ub_diff_ratio',
}
m1_selection = (
    m1_selection
    .merge(max_ratio, on=['protein', 'hp_index'], how='left')
    .rename(columns=column_names)
)
m1_selection.to_hdf('./summaries/m1_model_selection.h5', key='m1_selection')


In [10]:
# Display the Method 1 selection table.
m1_selection

Unnamed: 0,protein,hp_index,chosen_lag,chosen_num_its,median_vamp,lb_diff_vamp,ub_diff_vamp,feature_vamp_rank,overall_vamp_rank,cluster__k,...,tica__lag,tica__stride,protein_dir,feature,median_ratio_max,lag_y,new_num_its,median_ratio_value,lb_diff_ratio,ub_diff_ratio
0,BBA,81.0,40.0,4.0,3.881798,0.041517,0.023622,1.0,1.0,349.0,...,63.0,10.0,1fme,logit(dist.),1.448014,40.0,2.0,1.448014,0.420963,26.169422
1,BBA,99.0,40.0,4.0,3.873473,0.039878,0.025896,1.0,3.0,260.0,...,84.0,10.0,1fme,dist.,1.567118,40.0,2.0,1.567118,0.542374,22.10658
2,BBA,47.0,40.0,4.0,3.723278,0.092189,0.084179,1.0,35.0,291.0,...,45.0,10.0,1fme,dihed.,1.823242,40.0,2.0,1.823242,0.744791,2.086422
3,BBL,39.0,30.0,3.0,2.99094,0.015715,0.007356,1.0,1.0,311.0,...,33.0,10.0,2wav,dihed.,17.574127,30.0,2.0,17.574127,15.277985,482.879542
4,BBL,98.0,30.0,3.0,2.989382,0.020162,0.00725,1.0,2.0,316.0,...,44.0,10.0,2wav,dist.,2.775791,30.0,2.0,2.775791,1.679753,32.118444
5,BBL,70.0,30.0,3.0,2.988057,0.01495,0.007196,1.0,13.0,447.0,...,71.0,10.0,2wav,logit(dist.),2.156362,30.0,2.0,2.156362,1.097934,12.607181
6,Chignolin,27.0,20.0,2.0,1.89842,0.027399,0.012619,1.0,1.0,413.0,...,13.0,10.0,cln025,dist.,27.301713,20.0,2.0,27.301713,9.636534,7.789316
7,Chignolin,84.0,20.0,2.0,1.895211,0.024333,0.013697,1.0,17.0,296.0,...,21.0,10.0,cln025,logit(dist.),27.494573,20.0,2.0,27.494573,9.707897,8.007292
8,Chignolin,76.0,20.0,2.0,1.889887,0.025948,0.016377,1.0,33.0,298.0,...,1.0,10.0,cln025,dihed.,7.479769,20.0,2.0,7.479769,2.68779,12.183659
9,Homeodomain,67.0,20.0,4.0,3.995095,0.00809,0.001948,1.0,1.0,458.0,...,6.0,10.0,uvf,dihed.,1.933844,20.0,2.0,1.933844,0.769621,6.645089


# Method 2

Best VAMP scores conditional on timescale gap threshold

In [8]:
threshold = 0.98 # Quantile (a fraction, not a percentage) of each protein's timescale-gap distribution; only models whose gap lies above it are considered. Hopefully this gives a range of different features.

In [14]:
# One row per (protein, hp_index, lag, num_its) carrying both the VAMP summary (`*_vamp`)
# and the timescale-gap summary (`*_gap`), labelled with the model's feature.
df = (
    vamps
    .merge(ratios, on=['protein', 'hp_index', 'lag', 'num_its'], suffixes=('_vamp', '_gap'))
    .merge(hps[['protein', 'hp_index', 'feature']], on=['protein', 'hp_index'])
)

per_protein_best = []
for protein in funcs.PROTEIN_LABELS[:5] + funcs.PROTEIN_LABELS[6:9]:
    protein_rows = df[df['protein'] == protein]

    # Keep only the rows whose median gap lies above this protein's `threshold` quantile.
    q = np.quantile(protein_rows['median_gap'].values, q=threshold)
    best_gap = protein_rows[protein_rows['median_gap'] > q].copy()

    # Among those, take the highest median VAMP score for each feature (rank 1 = best).
    best_gap['feature_vamp_rank'] = best_gap.groupby(['feature'])['median_vamp'].rank(ascending=False)
    best_per_feature = best_gap[best_gap['feature_vamp_rank'] == 1.0]
    per_protein_best.append(best_per_feature)

# 'feature' is dropped before the join because the full hyperparameter table brings it back.
# The renaming makes the column names consistent with the m1 models.
m2_selection = (
    pd.concat(per_protein_best, axis=0)
    .drop(columns='feature')
    .merge(hps, on=['protein', 'hp_index'], how='left')
    .rename(columns={'num_its': 'new_num_its', 'lag': 'chosen_lag'})
)
m2_selection.to_hdf('./summaries/m2_model_selection.h5', key='m2_selection')