In [9]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pathlib import Path
import seaborn as sns
import plotly.express as px
import functions as funcs
import pyemma as pm
from pandas.api.types import CategoricalDtype

import matplotlib as mpl

In [56]:
def zero_var(x):
    if x.dtype=='object':
        return np.unique(x).shape[0] == 1
    else:
        return np.var(x) < 1e-12

# Load data

In [10]:
data_dir = Path('/Volumes/REA/Data/fast_folders/')


'vamps_and_hps' are the vamp-2 scores at the selected value of lag and num_its for each protein but with each bootstrap iteration

In [70]:
vamps = pd.read_hdf('vamps_and_hps.h5', key='vamps_hps')

Chosen lags and num_dom_procs are the specific values of markov lag time and number of dominant processes we're going to use in this analysis. 

In [13]:
chosen_lags = pd.read_hdf('chosen_lag_times.h5', key='chosen_lags')
chosen_dom_procs = pd.read_hdf('chosen_num_dominant.h5', key='chosen_num_dominant')

# Subset timescales dataframe

'timescales' contains all the timescale data. Subset the timescales for the specific lag and only keep the dominant timescales. 

In [62]:
ts = pd.read_hdf('timescales.h5', key='timescales')


lags_dict = dict(zip(chosen_lags['protein'], chosen_lags['lag']))
proc_dict = dict(zip(chosen_dom_procs['protein'], chosen_dom_procs['num_its']))

ts['choose_lag'] = ts['protein'].apply(lambda x: lags_dict[x])
ts['choose_k'] = ts['protein'].apply(lambda x: proc_dict[x])
ts['choose_method'] = 'VAMP2'

ts = ts.loc[(ts.lag == ts.choose_lag) & (ts.num_its <= ts.choose_k+1), : ]

ts = ts.drop(columns=ts.filter(like='choose', axis=1).columns)
ts = ts.drop(columns=ts.columns[ts.apply(zero_var, axis=0)])

# Aggregate boostrap samples

We want the median values for vamps and timescales

In [65]:
non_num_cols = list(ts.columns[ts.dtypes == 'object'])
agg_columns = ['protein', 'num_its', 'hp_index']

tmp = ts.groupby(agg_columns, as_index=False).median()
ts = tmp.merge(ts.loc[:, list(set(non_num_cols+agg_columns))], on=agg_columns, how='outer')
ts = ts.drop(columns=['iteration'])
ts.head()

Unnamed: 0,protein,num_its,hp_index,value,lag,cluster__k,distances__centre,distances__steepness,tica__dim,tica__lag,distances__transform,feature__value,distances__scheme
0,BBA,2.0,0.0,1501.259543,40.0,191.0,1.402167,23.955992,8.0,76.0,logistic,dihedrals,ca
1,BBA,2.0,0.0,1501.259543,40.0,191.0,1.402167,23.955992,8.0,76.0,logistic,dihedrals,ca
2,BBA,2.0,0.0,1501.259543,40.0,191.0,1.402167,23.955992,8.0,76.0,logistic,dihedrals,ca
3,BBA,2.0,0.0,1501.259543,40.0,191.0,1.402167,23.955992,8.0,76.0,logistic,dihedrals,ca
4,BBA,2.0,0.0,1501.259543,40.0,191.0,1.402167,23.955992,8.0,76.0,logistic,dihedrals,ca


In [71]:
vamps = vamps.drop(columns=vamps.columns[vamps.apply(zero_var, axis=0)])
non_num_cols = list(vamps.columns[vamps.dtypes == 'object'])
agg_columns = ['protein', 'hp_index']

tmp = vamps.groupby(agg_columns, as_index=False).median()
vamps = tmp.merge(vamps.loc[:, list(set(non_num_cols+agg_columns))], on=agg_columns, how='outer')
# vamp = vamp.drop(columns=['cluster__max_iter', 'cluster__stride', 'iteration','tica__stride'])
vamps.head()

Unnamed: 0,protein,hp_index,value,lag,k,iteration,cluster__k,distances__centre,distances__steepness,tica__dim,tica__lag,choose_lag,choose_k,feature,distances__transform,feature__value,distances__scheme,protein_dir
0,BBA,0.0,3.721257,40.0,4.0,50.5,191.0,1.402167,23.955992,8.0,76.0,40.0,4.0,Dihedral angles,logistic,dihedrals,ca,1fme
1,BBA,0.0,3.721257,40.0,4.0,50.5,191.0,1.402167,23.955992,8.0,76.0,40.0,4.0,Dihedral angles,logistic,dihedrals,ca,1fme
2,BBA,0.0,3.721257,40.0,4.0,50.5,191.0,1.402167,23.955992,8.0,76.0,40.0,4.0,Dihedral angles,logistic,dihedrals,ca,1fme
3,BBA,0.0,3.721257,40.0,4.0,50.5,191.0,1.402167,23.955992,8.0,76.0,40.0,4.0,Dihedral angles,logistic,dihedrals,ca,1fme
4,BBA,0.0,3.721257,40.0,4.0,50.5,191.0,1.402167,23.955992,8.0,76.0,40.0,4.0,Dihedral angles,logistic,dihedrals,ca,1fme
