In [1]:
import pyemma as pm
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import pandas as pd
import functions as funcs
import pickle
import time
import seaborn as sns

In [127]:
# Read the three model-selection summaries (one HDF5 file per selection method).
m1_selection, m2_selection, m3_selection = (
    pd.read_hdf(path)
    for path in (
        './summaries/m1_model_selection.h5',
        './summaries/m2_model_selection.h5',
        './summaries/m3_model_selection.h5',
    )
)

# Unique (protein, chosen_num_its) pairs taken from the m1 summary ...
chosen_k = m1_selection[['protein', 'chosen_num_its']].drop_duplicates()

# ... left-joined onto the m2 rows by protein, so every m2 row is kept and gains a
# `chosen_num_its` column (presumably absent from the m2 file itself - TODO confirm).
m2_selection = pd.merge(m2_selection, chosen_k, how='left', on='protein')

In [128]:
def _vamp_label(x):
    """Format one summary row as 'median, [lower-upper]' with two decimals.

    The bounds are derived from the row's `median`, `lb_diff` and `ub_diff` entries.
    """
    return f"{x['median']:4.2f}, [{x['median']-x['lb_diff']:4.2f}-{x['median']+x['ub_diff']:4.2f}]"


# Bootstrap summary of the VAMP scores; collapse the three numeric columns into a
# single printable 'VAMP-2 score' string and discard the raw numbers afterwards.
vamps = pd.read_hdf('./summaries/vamps_bs_summary.h5')
vamps['VAMP-2 score'] = vamps.apply(_vamp_label, axis=1)
vamps = vamps.drop(columns=['median', 'lb_diff', 'ub_diff'])

In [135]:
# Tag each summary with the name of the selection method it came from.
for frame, label in (
    (m1_selection, 'Fixed $k$'),
    (m2_selection, 'TS Gap'),
    (m3_selection, 'Fixed $k$ (worst)'),
):
    frame['Method'] = label

# Columns carried forward into the combined table (identifiers, hyper-parameters,
# the two num_its columns and the method tag).
cols = [
    'protein', 'hp_index', 'chosen_lag', 'feature__value',
    'distances__scheme', 'distances__transform', 'distances__centre', 'distances__steepness',
    'tica__lag', 'tica__dim', 'cluster__k',
    'chosen_num_its', 'new_num_its', 'Method',
]

# Stack the three summaries row-wise, restricted to the shared columns above.
selection = pd.concat(
    [frame[cols] for frame in (m1_selection, m2_selection, m3_selection)],
    axis=0,
)


In [204]:




# Attach the formatted VAMP-2 scores to the selected models. Only scores with
# num_its of 2, 3 or 4 are used, and a score row matches a model when protein,
# hyper-parameter index and lag (the model's `chosen_lag`) all agree.
table = selection.merge(
    vamps[vamps.num_its.isin([2, 3, 4])],
    left_on=['protein', 'hp_index', 'chosen_lag'],
    right_on=['protein', 'hp_index', 'lag'],
)

# Long -> wide: one row per model, one "VAMP-2(k=...)" column per num_its value.
table['num_its'] = table['num_its'].map(lambda x: f"VAMP-2(k={int(x)})")
table = table.pivot(columns='num_its', index=cols, values='VAMP-2 score').reset_index()
table.columns.name = None  # drop the leftover 'num_its' label on the column axis
table = table.drop(columns=['hp_index'])

# Rescale the transform parameters (centre x10, steepness /10 - per the original
# note this converts to angstroms / inverse angstroms), then render to one decimal.
table['distances__centre'] = (table['distances__centre'] * 10).map(lambda x: f"{x:4.1f}")
table['distances__steepness'] = (table['distances__steepness'] / 10).map(lambda x: f"{x:4.1f}")

# Capitalise the free-text settings.
for col in ('feature__value', 'distances__transform'):
    table[col] = [value.capitalize() for value in table[col]]

# Publication labels for the contact schemes; an unknown scheme raises KeyError.
scheme_labels = {'ca': r'C$\alpha$', 'closest-heavy': 'Closest-Heavy'}
table['distances__scheme'] = [scheme_labels[scheme] for scheme in table['distances__scheme']]

# Blank out settings that do not apply: every distance option for dihedral
# features, and centre/steepness whenever the transform is linear.
is_dihedral = table['feature__value'] == 'Dihedrals'
table.loc[
    is_dihedral,
    ['distances__scheme', 'distances__transform', 'distances__centre', 'distances__steepness'],
] = '-'

is_linear = table['distances__transform'] == 'Linear'
table.loc[is_linear, ['distances__centre', 'distances__steepness']] = '-'

# Integer-valued settings should print without a decimal point.
table = table.astype({
    col: int
    for col in ('chosen_lag', 'tica__lag', 'tica__dim', 'cluster__k', 'chosen_num_its', 'new_num_its')
})

def f(x):
    """Translate an internal column name into its display label.

    Parameters
    ----------
    x : hashable
        Column name to look up (used as the mapper for ``DataFrame.rename``).

    Returns
    -------
    The label registered for ``x``, or ``x`` unchanged when no label is
    registered for it.
    """
    # Every label containing LaTeX is a raw string. In a normal literal '\m'
    # (as in '\mathrm') is an invalid escape sequence, which Python reports with a
    # warning and intends to reject in the future; the raw form has the same value.
    labels = {
        'protein': 'Protein',
        'hp_rank': 'Rank',
        'chosen_lag': 'Lag (ns)',
        'k': 'Num. eigenvectors',
        'feature__value': 'Feature',
        'distances__scheme': 'Contact scheme',
        'distances__transform': 'Transform',
        'distances__centre': r'Center (\si{\angstrom})',
        'distances__steepness': r'Steepness (\si{\per\angstrom})',
        'tica__lag': 'TICA lag (ns)',
        'tica__dim': 'TICA dimension',
        'cluster__k': 'Num. clusters',
        'chosen_num_its': r'$k_{\mathrm{fixed}}$',
        'new_num_its': r'$k_{\mathrm{gap}}$',
    }
    # Names without an entry pass through untouched.
    return labels.get(x, x)

# Replace the internal column names with their display labels (via `f`), then
# order the rows by method, protein and feature.
table = table.rename(columns=f).sort_values(by=['Method', 'Protein', 'Feature'])

In [206]:
# Score columns created by the pivot (their names start with "VAMP").
vamp_columns = list(table.filter(regex='^VAMP').columns)

# Order of the rows in each printed table (the frame is transposed before printing).
# The LaTeX labels are raw strings: '\m' in a normal literal is an invalid escape
# sequence that Python warns about; the raw form has the same value.
row_order = [
    'Model no.', 'Method', 'Lag (ns)',
    'Feature', 'Contact scheme', 'Transform',
    r'Center (\si{\angstrom})', r'Steepness (\si{\per\angstrom})',
    'TICA lag (ns)', 'TICA dimension', 'Num. clusters',
    r'$k_{\mathrm{fixed}}$', r'$k_{\mathrm{gap}}$',
] + vamp_columns

# Print one LaTeX table per protein, with one column per model.
for protein in table.Protein.unique():

    tmp = table.loc[table.Protein == protein, :].copy()
    # Number the models 1..n in their current (sorted) order.
    tmp['Model no.'] = np.arange(1, tmp.shape[0] + 1)

    # Settings become rows and models become columns; the column headers are
    # blanked so that only the row labels are printed.
    tmp = tmp.loc[:, row_order].T
    tmp.columns = [''] * tmp.shape[1]

    # escape=False keeps the LaTeX markup in labels and values intact.
    print(tmp.to_latex(index=True, escape=False))

\begin{tabular}{llllllll}
\toprule
{} \\
\midrule
Model no.                      &                  1 &                  2 &                  3 &                  4 &                  5 &                  6 &                  7 \\
Method                         &          Fixed $k$ &          Fixed $k$ &          Fixed $k$ &  Fixed $k$ (worst) &             TS Gap &             TS Gap &             TS Gap \\
Lag (ns)                       &                 40 &                 40 &                 40 &                 40 &                 40 &                 40 &                 40 \\
Feature                        &          Dihedrals &          Distances &          Distances &          Distances &          Dihedrals &          Distances &          Distances \\
Contact scheme                 &                  - &          C$\alpha$ &          C$\alpha$ &      Closest-Heavy &                  - &      Closest-Heavy &          C$\alpha$ \\
Transform                      &             