In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pathlib import Path
import seaborn as sns
import functions as funcs

# Introduction

This notebook will select models using two different methods.  

**Method 1**

For each feature, select the model with the best VAMP score at a given lag time and a fixed number of implied processes. 

**Method 2**

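For each feature, select the best model when VAMP score and timescale gap are considered jointly: the best VAMP score among models whose timescale gap exceeds a threshold. 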



In [2]:
# Score summaries and timescale-ratio summary; `vamps` and `ratios` are later merged
# on protein / hp_index / lag / num_its.
vamps = pd.read_hdf('./summaries/vamps_bs_summary.h5')
vampe = pd.read_hdf('./summaries/vampe_bs_summary.h5')

ratios = pd.read_hdf('./summaries/ts_ratio_bs_summary.h5')

# Hyperparameter table: supplies the `feature` label for each (protein, hp_index).
hps = pd.read_hdf('./summaries/hp_summary.h5')

# Per-protein choices of lag time and number of dominant processes.
chosen_lags = pd.read_hdf('chosen_lag_times.h5')
chosen_ks = pd.read_hdf('chosen_num_dominant.h5')

# protein -> chosen value lookups, consumed by `select_rows_by_dict`.
lags = {protein: lag for protein, lag in zip(chosen_lags['protein'], chosen_lags['lag'])}
num_its = {protein: k for protein, k in zip(chosen_ks['protein'], chosen_ks['num_its'])}

In [3]:
# Preview the first rows of the timescale-ratio summary to check its columns.
ratios.head()

Unnamed: 0,protein,hp_index,lag,num_its,median,lb_diff,ub_diff
0,BBA,0.0,10.0,2.0,1.705263,0.660876,1.31553
1,BBA,0.0,10.0,3.0,1.408334,0.372168,1.497887
2,BBA,0.0,10.0,4.0,1.170877,0.143348,0.827098
3,BBA,0.0,10.0,5.0,1.175784,0.138526,0.475747
4,BBA,0.0,10.0,6.0,1.162832,0.148188,0.554509


In [4]:
def select_rows_by_dict(df, select_dict, selector):
    """Keep the rows of ``df`` whose ``selector`` value equals the value chosen for their protein.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'protein'`` column and a column named ``selector``.
    select_dict : dict
        Maps each protein label to the value of ``selector`` that should be kept.
    selector : str
        Name of the column compared against the values in ``select_dict``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows (original index preserved).
        An empty ``df`` yields an empty frame with the same columns.

    Raises
    ------
    KeyError
        If ``df`` contains a protein that is not a key of ``select_dict``.
    """
    proteins = df['protein']
    # Fail loudly (as a per-row dict lookup would) instead of silently dropping unknown proteins.
    unknown = proteins[~proteins.isin(list(select_dict))].unique()
    if len(unknown) > 0:
        raise KeyError(f"No selection value for protein(s): {list(unknown)}")
    # Vectorised comparison: look up the wanted value per row, then compare column-wise.
    wanted = proteins.map(select_dict)
    # A plain boolean array avoids any index alignment when the frame has duplicate labels.
    keep = (df[selector] == wanted).to_numpy()
    return df.loc[keep, :].copy()


# Restrict every summary table to the lag time chosen for each protein.
vamps, ratios, vampe = [
    select_rows_by_dict(summary, lags, 'lag')
    for summary in (vamps, ratios, vampe)
]

# Method 1

We'll select the best models per feature based on the selected values of $k$ (`num_its`) and lag time. Also include the worst model overall for each protein (this step is currently disabled in the code below).

In [7]:
# Keep only the rows whose number of implied timescales matches the value chosen per protein.
df = select_rows_by_dict(vamps, num_its, 'num_its')
# Attach the feature label of each hyperparameter set so models can be ranked within a feature.
df = df.merge(hps.loc[:, ['protein', 'hp_index', 'feature']], on=['protein', 'hp_index'])

# Rank 1 = highest median VAMP score. `method='first'` breaks ties by row order so every
# (protein, feature) group has exactly one rank-1 row; with the default 'average' method,
# tied leaders get a fractional rank (e.g. 1.5) and the `== 1.0` filter below would
# silently drop that feature.
df['feature_vamp_rank'] = df.groupby(['protein', 'feature'])['median'].rank(ascending=False, method='first')

# Rank across all features of a protein; stored alongside the selection and used for sorting.
df['overall_vamp_rank'] = df.groupby(['protein'])['median'].rank(ascending=False)

m1_selection = df.loc[df.feature_vamp_rank == 1.0, :].copy()

# Disabled: also append the worst-ranked model overall for each protein.
# grps = df.groupby(['protein'], as_index=False)
# worst = grps.apply(lambda g: g[g['overall_vamp_rank']==g['overall_vamp_rank'].max()])
# worst.reset_index(inplace=True, drop=True)


# m1_selection = pd.concat([best_per_feature, worst], axis=0)

# Swap the lone `feature` column for the full hyperparameter record of each selected model,
# then order the models of each protein from best to worst overall rank.
m1_selection = (
    m1_selection
    .drop(columns=['feature'])
    .merge(hps, on=['protein', 'hp_index'], how='left')
    .sort_values(by=['protein', 'overall_vamp_rank'])
)
m1_selection.to_hdf('./summaries/m1_model_selection.h5', key='m1_selection')



In [7]:
# old_selection = pd.read_hdf('best_hps_per_feature.h5')

# pd.concat([m1_selection.loc[:, ['overall_vamp_rank', 'hp_index', 'protein']], old_selection.loc[:, ['hp_rank', 'hp_index', 'protein']]], axis=1)

# Method 2

Select the best VAMP score per feature, conditional on the median timescale gap exceeding a per-protein quantile threshold.

In [10]:
# Quantile level applied per protein to the median timescale gaps: only models whose gap
# exceeds this quantile are considered. Hopefully this gives a range of different features.
threshold = 0.98

In [19]:
# Pair each VAMP summary row with the timescale-gap summary of the same model (same protein,
# hyperparameters, lag and number of timescales). Columns shared by both tables are
# disambiguated with the `_vamp` / `_gap` suffixes.
df = vamps.merge(ratios, on=['protein', 'hp_index', 'lag', 'num_its'], suffixes=('_vamp', '_gap'))
# Attach the feature label of each hyperparameter set.
df = df.merge(hps.loc[:, ['protein', 'hp_index', 'feature']], on=['protein', 'hp_index'])

m2_frames = []
# NOTE(review): this slice skips PROTEIN_LABELS[5] and everything from index 9 on — confirm intended.
for protein in funcs.PROTEIN_LABELS[:5]+funcs.PROTEIN_LABELS[6:9]:
    protein_rows = df.loc[df.protein == protein, :]

    # Keep only the models whose median gap lies above the `threshold` quantile for this protein.
    q = np.quantile(protein_rows['median_gap'].values, q=threshold)
    best_gap = protein_rows.loc[protein_rows.median_gap > q, :].copy()

    # Rank 1 = highest median VAMP score within a feature. `method='first'` breaks ties by row
    # order so each feature keeps exactly one model; the default 'average' method gives tied
    # leaders a fractional rank and the `== 1.0` filter would drop the feature entirely.
    best_gap['feature_vamp_rank'] = best_gap.groupby(['feature'])['median_vamp'].rank(ascending=False, method='first')
    best_per_feature = best_gap.loc[best_gap.feature_vamp_rank==1.0, :]
    m2_frames.append(best_per_feature)

# Stack the per-protein picks and swap the lone `feature` column for the full
# hyperparameter record of each selected model.
m2_selection = (
    pd.concat(m2_frames, axis=0)
    .drop(columns=['feature'])
    .merge(hps, on=['protein', 'hp_index'], how='left')
)

m2_selection.to_hdf('./summaries/m2_model_selection.h5', key='m2_selection')