In [9]:
import pyemma as pm
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import pandas as pd
import functions as funcs
import pickle
import time
import seaborn as sns

In [26]:
# Timescale table; the columns that hold whole numbers are cast to int in one
# pass instead of column by column.
timescales = pd.read_hdf('timescales.h5').astype({'num_its': int, 'hp_index': int})

# VAMP score table, with its whole-number columns cast the same way.
vamps = pd.read_hdf('vamps.h5').astype({'k': int, 'hp_index': int, 'lag': int})

# Root directory of the model-comparison data.
# NOTE(review): absolute path to a local volume - adjust before running on
# another machine.
data_dir = Path('/Volumes/REA/Data/fast_folders/model_comparisons/')


In [11]:
# Collapse the 'value' column of each (protein, hp_index, num_its, lag) group
# into its median and its 2.5% / 97.5% quantiles, then add the distance from
# the median to each bound.
# NOTE: this rebinds `timescales`; the summary has no 'value' column, so the
# cell cannot be re-run without reloading the data first.
timescales = (
    timescales
    .groupby(['protein', 'hp_index', 'num_its', 'lag'], as_index=False)
    .agg(
        median=('value', lambda v: np.quantile(v, 0.5)),
        lb=('value', lambda v: np.quantile(v, 0.025)),
        ub=('value', lambda v: np.quantile(v, 0.975)),
    )
    .assign(
        lb_diff=lambda d: d['median'] - d['lb'],
        ub_diff=lambda d: d['ub'] - d['median'],
    )
)
timescales.head()

Unnamed: 0,protein,hp_index,num_its,lag,median,lb,ub,lb_diff,ub_diff
0,BBA,0,2,10.0,685.33776,565.724676,1060.147961,119.613084,374.810202
1,BBA,0,2,20.0,1027.890242,871.405174,1328.090976,156.485068,300.200734
2,BBA,0,2,30.0,1271.702454,1094.700741,1686.354349,177.001713,414.651894
3,BBA,0,2,40.0,1501.259543,1277.074258,2070.07946,224.185285,568.819917
4,BBA,0,2,50.0,1696.696799,1405.683312,2531.717636,291.013487,835.020836


In [27]:
# Keep only the rows whose method is 'VAMP2', then collapse the 'value' column
# of each (protein, hp_index, k, lag) group into its median and its
# 2.5% / 97.5% quantiles, and add the distance from the median to each bound.
# NOTE: this rebinds `vamps`; the summary has neither a 'method' nor a 'value'
# column, so the cell cannot be re-run without reloading the data first.
vamps = (
    vamps[vamps['method'] == 'VAMP2']
    .groupby(['protein', 'hp_index', 'k', 'lag'], as_index=False)
    .agg(
        median=('value', lambda v: np.quantile(v, 0.5)),
        lb=('value', lambda v: np.quantile(v, 0.025)),
        ub=('value', lambda v: np.quantile(v, 0.975)),
    )
    .assign(
        lb_diff=lambda d: d['median'] - d['lb'],
        ub_diff=lambda d: d['ub'] - d['median'],
    )
)


In [28]:
# Per-protein selections stored alongside the notebook: the first file is read
# into `chosen_lags`, the second into `chosen_ks`.
chosen_lags, chosen_ks = (
    pd.read_hdf(filename)
    for filename in ('chosen_lag_times.h5', 'chosen_num_dominant.h5')
)

In [75]:
# Columns of model_definitions.h5 that are carried into the report tables.
MODEL_COLUMNS = ['protein', 'hp_index', 'hp_rank', 'lag', 'k', 'feature__value', 'distances__scheme',
                 'distances__transform', 'distances__centre', 'distances__steepness', 'tica__lag',
                 'tica__dim', 'cluster__k']

# Distance-feature settings; they are blanked out for dihedral models.
DISTANCE_COLUMNS = ['distances__scheme', 'distances__transform', 'distances__centre',
                    'distances__steepness']

# Display names for the contact schemes.
SCHEME_LABELS = {'ca': r'C$\alpha$', 'closest-heavy': 'Closest-Heavy'}

# Raw string: '\S' and '\p' are not valid escape sequences in a normal literal.
VAMP_CI_LABEL = r'VAMP-2 \SI{95}{\percent} C.I.'

# Display names for the table columns; columns not listed keep their name.
COLUMN_LABELS = {'protein': 'Protein', 'hp_rank': 'Rank', 'lag': 'Lag (ns)', 'k': 'Num. eigenvectors',
                 'feature__value': 'Feature', 'distances__scheme': 'Contact scheme',
                 'distances__transform': 'Transform',
                 'distances__centre': r'Logistic center (\si{\angstrom})',
                 'distances__steepness': 'Logistic steepness', 'tica__lag': 'TICA lag (ns)',
                 'tica__dim': 'TICA dimension', 'cluster__k': 'Num. clusters'}


def make_model_table(model_defs, vamp_summary):
    """Build the formatted hyper-parameter table for one protein.

    Parameters
    ----------
    model_defs : pandas.DataFrame
        Model definitions; must contain every column in ``MODEL_COLUMNS``.
    vamp_summary : pandas.DataFrame
        Score summary with key columns ``protein``, ``hp_index``, ``k``,
        ``lag`` and value columns ``median``, ``lb``, ``ub``, ``lb_diff``,
        ``ub_diff``.

    Returns
    -------
    pandas.DataFrame
        One row per model in ``hp_rank`` order, indexed 1..n under the index
        name ``Model``, with formatted score columns and display column names.

    Raises
    ------
    ValueError
        If the merge with ``vamp_summary`` does not yield exactly one row per
        model definition.
    KeyError
        If a ``distances__scheme`` value has no entry in ``SCHEME_LABELS``.
    """
    ranked = model_defs.sort_values(by='hp_rank')
    table = ranked.loc[:, MODEL_COLUMNS].merge(vamp_summary, on=['protein', 'hp_index', 'k', 'lag'])

    # Fail with a clear message instead of an opaque index-length mismatch
    # when a model has no (or more than one) matching score row.
    if len(table) != len(ranked):
        raise ValueError(
            f"Expected one score row per model ({len(ranked)} models) "
            f"but the merge produced {len(table)} rows"
        )

    # Nice vamp scores
    table['VAMP-2 score'] = table['median'].map('{:4.3f}'.format)
    table[VAMP_CI_LABEL] = [f"[{lo:4.3f}, {hi:4.3f}]" for lo, hi in zip(table['lb'], table['ub'])]

    # Nice numbers
    for col in ['distances__centre', 'distances__steepness']:
        table[col] = table[col].map('{:4.2f}'.format)

    # Nice text
    for col in ['feature__value', 'distances__transform']:
        table[col] = table[col].str.capitalize()

    # Better scheme labels (explicit lookup so an unknown scheme raises KeyError)
    table['distances__scheme'] = table['distances__scheme'].map(lambda scheme: SCHEME_LABELS[scheme])

    # Remove the merge/score helper columns
    table = table.drop(columns=['median', 'lb', 'ub', 'lb_diff', 'ub_diff', 'hp_index'])

    # Remove redundant values: distance settings are shown as '-' for dihedral models
    is_dihedral = table['feature__value'] == 'Dihedrals'
    table.loc[is_dihedral, DISTANCE_COLUMNS] = '-'

    table.index = pd.RangeIndex(1, len(table) + 1, name='Model')
    return table.rename(columns=COLUMN_LABELS)


tables = []
# NOTE(review): only the first 8 entries of funcs.PROTEIN_DIRS are used - confirm the cut-off.
for protein in funcs.PROTEIN_DIRS[:8]:
    model_defs = pd.read_hdf(data_dir.joinpath(protein, 'model_definitions.h5'))
    tables.append(make_model_table(model_defs, vamps))


In [77]:
# Row order of the transposed LaTeX table (one column per model).
# The labels containing backslashes are raw strings: '\S', '\p' and '\s' are
# not valid escape sequences in a normal string literal.
LATEX_ROWS = ['Rank', 'VAMP-2 score', r'VAMP-2 \SI{95}{\percent} C.I.', 'Num. eigenvectors', 'Lag (ns)',
              'Feature', 'Contact scheme', 'Transform', r'Logistic center (\si{\angstrom})',
              'Logistic steepness', 'TICA lag (ns)', 'TICA dimension', 'Num. clusters']

for model_table in tables:
    # Section heading named after the protein in the table's first row.
    print(rf'\subsubsection{{{model_table.Protein.values[0]}}}')
    # Transpose so that each model becomes a column; a separate name is used so
    # the loop variable is not overwritten.
    latex_table = model_table.loc[:, LATEX_ROWS].T
    # escape=False keeps the LaTeX markup in labels and cells unescaped.
    print(latex_table.to_latex(index=True, escape=False))

\subsubsection{BBA}
\begin{tabular}{lllll}
\toprule
Model &               1 &               2 &               3 &               4 \\
\midrule
Rank                             &               1 &               3 &              35 &             100 \\
VAMP-2 score                     &           3.882 &           3.873 &           3.723 &           2.505 \\
VAMP-2 \SI{95}{\percent} C.I.    &  [3.840, 3.905] &  [3.834, 3.899] &  [3.631, 3.807] &  [2.442, 2.562] \\
Num. eigenvectors                &               4 &               4 &               4 &               4 \\
Lag (ns)                         &              40 &              40 &              40 &              40 \\
Feature                          &       Distances &       Distances &       Dihedrals &       Distances \\
Contact scheme                   &       C$\alpha$ &       C$\alpha$ &               - &   Closest-Heavy \\
Transform                        &        Logistic &          Linear &               - &        Logist

In [None]:
# NOTE(review): leftover IPython help query; no name `tab` is defined in the
# visible cells - this cell can probably be deleted.
?tab