In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pathlib import Path
import seaborn as sns
import functions as funcs

# Introduction

This notebook will select models using two different methods.  

**Method 1**

For each protein, the model with the best VAMP score per feature at the chosen lag time and a fixed number of dominant processes (`num_its`). 

**Method 2**

For each protein, the model with the best VAMP score per feature among those models whose timescale gap lies above a quantile threshold. 



In [2]:
# Per-model summary tables. The `_bs_` files are presumably bootstrap summaries -- TODO confirm.
vamps = pd.read_hdf('./summaries/vamps_bs_summary.h5')
vampe = pd.read_hdf('./summaries/vampe_bs_summary.h5')
ratios = pd.read_hdf('./summaries/ts_ratio_bs_summary.h5')
hps = pd.read_hdf('./summaries/hp_summary.h5')

# Per-protein choices of lag time and number of dominant processes, each kept
# both as the table read from disk and as a protein -> value lookup.
chosen_lags = pd.read_hdf('chosen_lag_times.h5')
chosen_ks = pd.read_hdf('chosen_num_dominant.h5')
lags = {
    protein: lag
    for protein, lag in zip(chosen_lags['protein'], chosen_lags['lag'])
}
num_its = {
    protein: k
    for protein, k in zip(chosen_ks['protein'], chosen_ks['num_its'])
}

FileNotFoundError: File ./summaries/vamps_bs_summary.h5 does not exist

In [None]:
# Preview the first rows of the timescale-ratio summary read from ts_ratio_bs_summary.h5.
ratios.head()

In [None]:
def select_rows_by_dict(df, select_dict, selector):
    """Keep the rows whose `selector` value matches the value chosen for their protein.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'protein' column and a column named `selector`.
    select_dict : dict
        Maps each protein label to the value that `df[selector]` must equal
        for a row of that protein to be kept.
    selector : str
        Name of the column that is compared against `select_dict`.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows, with the original index
        labels and row order. An empty input yields an empty frame with the
        same columns.

    Raises
    ------
    KeyError
        If `df` contains a protein that has no entry in `select_dict`.
    """
    proteins = df['protein']

    # A row-wise lookup would raise KeyError for unknown proteins; keep that
    # contract, but report every offending label at once.
    unknown = proteins[~proteins.isin(list(select_dict))].unique()
    if len(unknown) > 0:
        raise KeyError(f"no entry in select_dict for protein(s): {list(unknown)}")

    # Vectorised comparison: also yields a proper (empty) boolean mask when
    # `df` has no rows, which a row-wise `apply` does not.
    wanted = proteins.map(select_dict)
    keep = df[selector].eq(wanted)
    return df.loc[keep, :].copy()


# Restrict every summary table to the lag time chosen for each protein.
vamps = select_rows_by_dict(df=vamps, select_dict=lags, selector='lag')
ratios = select_rows_by_dict(df=ratios, select_dict=lags, selector='lag')
vampe = select_rows_by_dict(df=vampe, select_dict=lags, selector='lag')

# Method 1

## Choose hyperparameters
We'll select the best model per feature based on the selected values of $k$ (`num_its`) and lag time. The worst-ranked model of each protein is also included here; it is split off into its own selection (`m3_selection`) at the end. 

In [5]:
# Keep only the rows computed with the number of dominant processes chosen for
# each protein, then attach the feature label of every hyperparameter trial.
df = select_rows_by_dict(vamps, num_its, 'num_its')
df = df.merge(hps.loc[:, ['protein', 'hp_index', 'feature']], on=['protein', 'hp_index'])

# Rank 1 = highest median VAMP score, once within each (protein, feature)
# and once across all features of a protein.
df['feature_vamp_rank'] = df.groupby(['protein', 'feature'])['median'].rank(ascending=False)
df['overall_vamp_rank'] = df.groupby(['protein'])['median'].rank(ascending=False)

# Best model per feature for every protein.
m1_selection = df.loc[df.feature_vamp_rank == 1.0, :].copy()

# Worst model(s) per protein: rows whose overall rank equals the protein's
# largest rank. A transform-based mask replaces `groupby.apply` with a
# filtering lambda, which operates on the grouping column and is deprecated
# in recent pandas. The stable sort reproduces the group-by-group row order.
grps = df.groupby(['protein'], as_index=False)
largest_rank = grps['overall_vamp_rank'].transform('max')
worst = (
    df.loc[df['overall_vamp_rank'] == largest_rank, :]
    .sort_values(by='protein', kind='mergesort')
    .reset_index(drop=True)
)

# Append the worst models, swap the single 'feature' column for the full
# hyperparameter table (which carries 'feature' itself) and order the rows
# by protein and overall rank.
m1_selection = (
    pd.concat([m1_selection, worst], axis=0)
    .drop(columns=['feature'])
    .merge(hps, on=['protein', 'hp_index'], how='left')
    .sort_values(by=['protein', 'overall_vamp_rank'])
)



In [6]:
# Preview the selection: best model per feature plus the worst model of each protein.
m1_selection.head()

Unnamed: 0,protein,hp_index,lag,num_its,median,lb_diff,ub_diff,feature_vamp_rank,overall_vamp_rank,cluster__k,...,distances__centre,distances__scheme,distances__steepness,distances__transform,feature__value,tica__dim,tica__lag,tica__stride,protein_dir,feature
1,BBA,81.0,40.0,4.0,3.881798,0.041517,0.023622,1.0,1.0,349.0,...,0.61237,ca,24.721685,logistic,distances,6.0,63.0,10.0,1fme,logit(dist.)
2,BBA,99.0,40.0,4.0,3.873473,0.039878,0.025896,1.0,3.0,260.0,...,0.799548,ca,8.262109,linear,distances,10.0,84.0,10.0,1fme,dist.
0,BBA,47.0,40.0,4.0,3.723278,0.092189,0.084179,1.0,35.0,291.0,...,0.82218,ca,14.431844,linear,dihedrals,9.0,45.0,10.0,1fme,dihed.
24,BBA,40.0,40.0,4.0,2.505273,0.063739,0.056984,23.0,100.0,96.0,...,1.329784,closest-heavy,17.36807,logistic,distances,1.0,87.0,10.0,1fme,logit(dist.)
3,BBL,39.0,30.0,3.0,2.99094,0.015715,0.007356,1.0,1.0,311.0,...,1.499422,closest-heavy,45.890768,linear,dihedrals,10.0,33.0,10.0,2wav,dihed.


In [7]:
# Preview the rows holding the largest (i.e. worst) overall VAMP rank of each protein.
worst.head()

Unnamed: 0,protein,hp_index,lag,num_its,median,lb_diff,ub_diff,feature,feature_vamp_rank,overall_vamp_rank
0,BBA,40.0,40.0,4.0,2.505273,0.063739,0.056984,logit(dist.),23.0,100.0
1,BBL,93.0,30.0,3.0,1.008669,0.004362,0.058434,dihed.,47.0,97.0
2,Chignolin,57.0,20.0,2.0,1.300495,0.028826,0.02321,logit(dist.),23.0,99.0
3,Homeodomain,52.0,20.0,4.0,2.623382,0.507482,0.283287,logit(dist.),23.0,87.0
4,Protein-B,52.0,40.0,3.0,1.805904,0.589591,0.479946,logit(dist.),17.0,87.0


# Choose new num_its

In [8]:
# For every model (protein, hp_index) locate the row of `ratios` at which the
# median timescale ratio reaches its maximum.
peak_per_model = ratios.groupby(['protein', 'hp_index'], as_index=False)['median'].max()
ratio_vs_peak = peak_per_model.merge(
    ratios,
    on=['protein', 'hp_index'],
    suffixes=('_ratio_max', '_ratio_value'),
)
at_peak = (ratio_vs_peak['median_ratio_max'] - ratio_vs_peak['median_ratio_value']).abs() < 1e-9
max_ratio = ratio_vs_peak.loc[at_peak, :]

# Attach the peak-ratio row to each selected model. Columns present on both
# sides receive _x (VAMP side) / _y (ratio side) suffixes and are renamed so
# that `new_num_its` is the num_its at which the ratio peaks.
m1_selection = m1_selection.merge(max_ratio, on=['protein', 'hp_index'], how='left')
m1_selection = m1_selection.rename(columns={
    'median': 'median_vamp',
    'lb_diff_x': 'lb_diff_vamp',
    'ub_diff_x': 'ub_diff_vamp',
    'lag_x': 'chosen_lag',
    'num_its_x': 'chosen_num_its',
    'num_its_y': 'new_num_its',
    'lb_diff_y': 'lb_diff_ratio',
    'ub_diff_y': 'ub_diff_ratio',
})

# One row per protein carries the largest overall rank (the worst model):
# that row becomes m3, everything else stays in m1.
worst_labels = m1_selection.groupby('protein')['overall_vamp_rank'].idxmax()
is_worst = m1_selection.index.isin(worst_labels)
m3_selection = m1_selection.loc[is_worst, :]
m1_selection = m1_selection.loc[~is_worst, :]

m3_selection.to_hdf('./summaries/m3_model_selection.h5', key='m3_selection')
m1_selection.to_hdf('./summaries/m1_model_selection.h5', key='m1_selection')

# Method 2

Best VAMP scores conditional on timescale gap threshold

In [9]:
# Quantile of each protein's timescale-gap distribution used as a cut-off: only models
# whose median gap lies above this quantile are considered when picking the best VAMP
# score per feature. Hopefully this gives a range of different features.
threshold = 0.98

In [10]:
# Pair each model's VAMP summary with its timescale-gap summary (same protein,
# hp_index, lag and num_its), then attach the feature label of the trial.
df = (
    vamps
    .merge(ratios, on=['protein', 'hp_index', 'lag', 'num_its'], suffixes=('_vamp', '_gap'))
    .merge(hps.loc[:, ['protein', 'hp_index', 'feature']], on=['protein', 'hp_index'])
)

# Proteins handled here: labels 0-4 and 6-8 of funcs.PROTEIN_LABELS.
selected_proteins = funcs.PROTEIN_LABELS[:5] + funcs.PROTEIN_LABELS[6:9]

best_per_protein = []
for protein in selected_proteins:
    protein_rows = df.loc[df.protein == protein, :]

    # Keep only the rows whose median gap lies strictly above the cut-off quantile.
    gap_cutoff = np.quantile(protein_rows['median_gap'].values, q=threshold)
    best_gap = protein_rows.loc[protein_rows.median_gap > gap_cutoff, :].copy()

    # Among those, rank 1 = highest median VAMP score within each feature.
    best_gap['feature_vamp_rank'] = best_gap.groupby(['feature'])['median_vamp'].rank(ascending=False)
    best_per_protein.append(best_gap.loc[best_gap.feature_vamp_rank == 1.0, :])

# Stack the per-protein picks, swap the single 'feature' column for the full
# hyperparameter table, and rename columns to be consistent with the m1 models.
m2_selection = (
    pd.concat(best_per_protein, axis=0)
    .drop(columns='feature')
    .merge(hps, on=['protein', 'hp_index'], how='left')
    .rename(columns={'num_its': 'new_num_its', 'lag': 'chosen_lag'})
)
m2_selection.to_hdf('./summaries/m2_model_selection.h5', key='m2_selection')