In [38]:
import numpy as np
from pathlib import Path
import pandas as pd



In [64]:
# Both the model-selection table and the VAMP score table are read from the
# same summary store; the hyperparameter definitions come from a separate one.
summary_path = '../data/msms/1fme/summary.h5'
hp_definitions = pd.read_hdf('../data/msms/hpsample.h5', key='hyperparameters')

# Keep only the columns of the selection table that are used further down.
selection = pd.read_hdf(summary_path, key='model_selection')
selection = selection.loc[:, ['hp_ix', 'method', 'feature', 'process', 'lag']]

# Move the stored index of the score table into ordinary columns.
vamps = pd.read_hdf(summary_path, key='vamps').reset_index()

In [65]:
# Preview the first rows of the VAMP score table after its index was reset.
vamps.head()

Unnamed: 0,hp_ix,lag,process,median,lb,ub,count
0,28,1,2,1.991242,1.987578,1.997172,100
1,28,1,3,2.980015,2.968074,2.987502,100
2,28,1,4,3.965492,3.947025,3.975981,100
3,28,1,5,4.94692,4.922022,4.963168,100
4,28,1,6,5.92558,5.890932,5.947508,100


Prettify some labels

In [66]:
def f(x, y):
    """Return, as text, the percentage by which ``x`` falls short of ``y``.

    The value ``100 * (1 - x / y)`` is formatted with one decimal place and a
    minimum field width of three characters, e.g. ``"0.4"`` or ``"50.0"``.
    """
    shortfall_pct = 100 * (1 - x / y)
    return format(shortfall_pct, "3.1f")

def _format_loss(row):
    """Build the table entry ``"<median> \\%, [<lb>-<ub>]"`` for one score row.

    Each of the ``median``, ``lb`` and ``ub`` scores is passed to ``f`` together
    with the row's ``process`` value, which serves as the reference the score
    is compared against (presumably the maximum attainable score - confirm).
    """
    median, lower, upper = (f(row[key], row['process']) for key in ('median', 'lb', 'ub'))
    # Raw f-string: the backslash must reach LaTeX literally, and "\%" in an
    # ordinary literal is an invalid escape sequence.
    return rf"{median} \%, [{lower}-{upper}]"


# Raw string for the same reason ("\D" is not a valid escape); the resulting
# column name is unchanged.
vamps[r'$\Delta$ VAMP-2'] = vamps.apply(_format_loss, axis=1)

# Display names for the selection methods; unknown methods keep their raw name.
method_labels = dict(fixed_k='Fixed $k$', timescale_gap='TS Gap', worst='Fixed $k$ (Worst)')
selection['method'] = selection['method'].apply(lambda x: method_labels.get(x, x))

# Mark these as the *chosen* values so they stay distinct from the `lag` and
# `process` columns of the score table when the two are merged.
selection = selection.rename(columns={'process': 'chosen_num_its', 'lag': 'chosen_lag'})

Add in model definitions

In [67]:
# Turn the index of the hyperparameter table into ordinary columns
# (presumably this is what exposes `hp_ix` as a column - confirm), then attach
# the definitions to the selected models. The left join keeps every row of
# `selection`.
hp_definitions = hp_definitions.reset_index()
selection = pd.merge(selection, hp_definitions, how='left', on=['hp_ix'])

Tidy up

In [68]:

# Columns kept for the final table. The same list is reused later as the
# index of the pivot, so its order matters.
cols = [
    'hp_ix', 'chosen_lag', 'feature__value',
    'distances__scheme', 'distances__transform',
    'distances__centre', 'distances__steepness',
    'tica__lag', 'tica__dim', 'cluster__k',
    'chosen_num_its', 'method',
]
selection = selection[cols]

# Drop the numeric score statistics from the score table.
vamps = vamps.drop(columns=['median', 'lb', 'ub', 'count'])

Main formatting

In [73]:

# add in VAMP scores
# Only the scores for 2, 3 and 4 processes are reported. Each selected model is
# matched to the scores at its chosen lag (inner join on hp_ix and lag).
reported_vamps = vamps.loc[vamps['process'].isin([2, 3, 4]), :]
table = selection.merge(reported_vamps, left_on=['hp_ix', 'chosen_lag'], right_on=['hp_ix', 'lag'])
table = table.rename(columns={'process': 'num_its'})

# Pivot: one row per model, one "Loss(k=...)" column per number of processes.
table['num_its'] = table['num_its'].apply(lambda x: f"Loss(k={int(x)})")
# Raw string: "\D" in an ordinary literal is an invalid escape sequence. The
# column name itself is unchanged.
table = table.pivot(index=cols, values=r'$\Delta$ VAMP-2', columns='num_its')

# Tidy index: bring the pivot index back as columns and drop the header name.
table = table.reset_index()
table.columns.name = None

# Nice numbers - convert to angstroms/inverse angstroms first
table['distances__centre'] *= 10
table['distances__steepness'] /= 10
for col in ['distances__centre', 'distances__steepness']:
    # One decimal place, minimum width four; the column holds strings afterwards.
    table[col] = table[col].map('{:4.1f}'.format)

    
# Nice text
def safe_cap(x):
    """Capitalise ``x`` when it is a string; return ``""`` for anything else.

    Non-string entries (such as a float NaN) yield an empty string rather
    than raising.
    """
    return x.capitalize() if isinstance(x, str) else ""

# Capitalise string entries; non-string entries become ''.
for col in ('feature__value', 'distances__transform'):
    table[col] = table[col].map(safe_cap)

# Better scheme labels (anything without a label is blanked)
scheme_labels = {'ca': r'C$\alpha$', 'closest-heavy': 'Closest-Heavy'}
table['distances__scheme'] = table['distances__scheme'].map(lambda scheme: scheme_labels.get(scheme, ''))

# Remove redundant values
# Rows whose feature is 'Dihedrals' get '-' in all four distance columns.
is_dihedral = table['feature__value'] == 'Dihedrals'
table.loc[is_dihedral, ['distances__scheme', 'distances__transform',
                        'distances__centre', 'distances__steepness']] = '-'

# Rows whose transform is 'Linear' get '-' for centre and steepness. The mask
# is taken after the step above, so dihedral rows (transform now '-') do not match.
is_linear = table['distances__transform'] == 'Linear'
table.loc[is_linear, ['distances__centre', 'distances__steepness']] = '-'

# Formatting: these columns are cast to integers
for col in ('chosen_lag', 'tica__lag', 'tica__dim', 'cluster__k', 'chosen_num_its'):
    table[col] = table[col].astype(int)

# NOTE(review): this reuses the name `f` of the two-argument percentage
# formatter defined earlier in the notebook and replaces it once this cell runs.
def f(x):
    """Translate an internal column name into its display header.

    Names that have no entry in the lookup are returned unchanged.
    """
    display_names = {
        'protein': 'Protein',
        'hp_rank': 'Rank',
        'chosen_lag': 'Lag (ns)',
        'k': 'Num. eigenvectors',
        'feature__value': 'Feature',
        'distances__scheme': 'Contact scheme',
        'distances__transform': 'Transform',
        'distances__centre': r'Center (\si{\angstrom})',
        'distances__steepness': r'Steepness (\si{\per\angstrom})',
        'tica__lag': 'TICA lag (ns)',
        'tica__dim': 'TICA dimension',
        'cluster__k': 'Num. clusters',
        'chosen_num_its': 'Num. Dominant',
        'method': 'Method',
    }
    return display_names.get(x, x)

# Replace internal column names with their display headers.
table = table.rename(columns=f)

# One explicit multi-key sort: order by method, then feature, with the
# transform in reverse alphabetical order inside each group. This states the
# tie-break directly rather than relying on a second sort preserving the
# order left by a first one.
table = table.sort_values(by=['Method', 'Feature', 'Transform'], ascending=[True, True, False])

# The pivoted score columns all contain "Loss" in their header.
vamp_columns = list(table.filter(regex='Loss').columns)

# Number the models in their sorted order; the numbers become the column
# headers of the transposed table.
table['Model no.'] = np.arange(1, table.shape[0] + 1)
row_order = ['Method', 'Lag (ns)',
             'Feature', 'Transform', 'Contact scheme', r'Center (\si{\angstrom})',
             r'Steepness (\si{\per\angstrom})', 'TICA lag (ns)',
             'TICA dimension', 'Num. clusters', 'Num. Dominant'] + vamp_columns

# One column per model, one row per attribute. Columns not listed in
# `row_order` are left out.
table = table.set_index('Model no.').loc[:, row_order].T

# escape=False keeps the LaTeX markup in headers and cells intact.
print(table.to_latex(index=True, escape=False))

\begin{tabular}{llllllll}
\toprule
Model no. &                  1 &                  2 &                   3 &                     4 &                     5 &                     6 &                   7 \\
\midrule
Method                         &          Fixed $k$ &          Fixed $k$ &           Fixed $k$ &     Fixed $k$ (Worst) &                TS Gap &                TS Gap &              TS Gap \\
Lag (ns)                       &                 41 &                 41 &                  41 &                    41 &                    41 &                    41 &                  41 \\
Feature                        &          Dihedrals &          Distances &           Distances &             Dihedrals &             Dihedrals &             Distances &           Distances \\
Transform                      &                  - &           Logistic &              Linear &                     - &                     - &              Logistic &              Linear \\
Contact scheme   

  print(table.to_latex(index=True, escape=False))
