In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
from typing import *

In [2]:
from sensetools.plots import feature_labeller

In [5]:
# --- Analysis configuration -------------------------------------------------
# Lag value used to select rows (`lag == 41`) from the VAMP / timescale tables.
lag = 41

# System identifier; also used as the directory and file-name prefix for
# inputs and saved figures.
protein = '1fme'

# Per-protein reference timescale, drawn as a dashed horizontal line on the
# timescale axis (which is labelled in ns).
# NOTE(review): presumably the slowest timescale from the reference MD study
# (18 us expressed in ns) - confirm the source and units.
md_t2 = {'1fme': 18_000}

# Unused hand-picked (hp_ix, selection method) list, kept for reference only:
# order = [(53, 'fixed_k'), (60, 'fixed_k'), (52, 'fixed_k'), (47, 'worst'), (81, 'timescale_gap'), (60, 'timescale_gap'), (86, 'timescale_gap')]

In [73]:
# All summary tables for this protein live in a single HDF5 file.
summary_path = f'{protein}/summary.h5'

# The model-selection table is loaded with its stored index left untouched.
selection = pd.read_hdf(summary_path, key='model_selection')

# For the remaining tables the stored index levels are moved into ordinary
# columns so they can be filtered and merged on by name further down.
timescales = pd.read_hdf(summary_path, key='timescales').reset_index()
vamps = pd.read_hdf(summary_path, key='vamps').reset_index()
gaps = pd.read_hdf(summary_path, key='timescale_ratio').reset_index()

In [76]:
# Hyper-parameter samples: one row per sampled model configuration, with the
# stored index moved into columns (the merge below needs `hp_ix` as a column).
hp_samples = pd.read_hdf('../data/msms/hpsample.h5').reset_index()

# Human-readable feature label for every hyper-parameter row.
hp_samples['feature'] = hp_samples.apply(feature_labeller, axis=1)

# Attach the feature label to the score tables.
# Any `feature` column left over from a previous run of this cell is dropped
# first: otherwise a second execution would merge the label twice and pandas
# would rename the columns to `feature_x` / `feature_y`, breaking every later
# `groupby('feature')`.
# NOTE: this is an inner merge, so rows whose `hp_ix` is absent from
# `hp_samples` are discarded.
feature_by_hp = hp_samples.loc[:, ['feature', 'hp_ix']]
vamps = vamps.drop(columns='feature', errors='ignore').merge(feature_by_hp, on='hp_ix')
timescales = timescales.drop(columns='feature', errors='ignore').merge(feature_by_hp, on='hp_ix')

In [77]:
def top_performers_by_feature(vamps: pd.DataFrame, k: int, lag: int)-> Dict[int, float]:
    """Rank models *within each feature* by their `median` score.

    Parameters
    ----------
    vamps : pd.DataFrame
        Must contain the columns `process`, `lag`, `hp_ix`, `median` and
        `feature`.
    k : int
        Only rows with `process == k` are ranked.
    lag : int
        Only rows with `lag == lag` are ranked.

    Returns
    -------
    Dict[int, float]
        Maps `hp_ix` to its rank inside its feature group. Rank 1.0 is the
        row with the largest `median` in that group. Empty if no row matches.
    """
    selected = (vamps.process == k) & (vamps.lag == lag)
    df = vamps.loc[selected, ['hp_ix', 'median', 'feature']].copy()

    # method='first' breaks ties by order of appearance, so every non-empty
    # feature group has exactly one row ranked 1.0. With the pandas default
    # ('average') two tied best models would both get 1.5 and a caller
    # filtering on `rank == 1` would silently lose the whole feature.
    df['rank'] = df.groupby(['feature'])['median'].rank(ascending=False, method='first')
    df.sort_values(by='rank', inplace=True)
    return dict(zip(df['hp_ix'], df['rank']))

def top_performers(vamps: pd.DataFrame, k: int, lag: int)-> Dict[int, int]:
    """Rank all models against each other by their `median` score.

    Only rows with `process == k` and `lag == lag` take part. The row with
    the largest `median` gets rank 1; tied rows share the average of the
    ranks they span (the pandas default).

    Returns a mapping from `hp_ix` to rank, empty when no row matches.
    """
    is_selected = (vamps.process == k) & (vamps.lag == lag)
    scores = vamps.loc[is_selected, ['hp_ix', 'median']]
    ranked = (
        scores
        .assign(rank=scores['median'].rank(ascending=False))
        .sort_values(by='rank')
    )
    return {hp_ix: position for hp_ix, position in zip(ranked['hp_ix'], ranked['rank'])}


def plot_val_by_mod_proc(ax, value_df, color='by_proc', width=0.4, cols=None):
    """Draw one horizontal bar plus a shaded band per (rank, process) group.

    Parameters
    ----------
    ax : matplotlib Axes (anything providing `hlines` and `fill_between`).
    value_df : pd.DataFrame
        Must contain the columns `rank`, `process`, `median`, `lb`, `ub`.
        `rank` is used as the x position of the model.
    color : str
        `'by_proc'` colours each group by its process via `cols[process - 2]`;
        any other value is passed to matplotlib as the colour of every group.
    width : float
        Half-width of each bar/band along x. Previously read from a global
        `width`; the default matches the value used by the plotting loop.
    cols : sequence of colours, optional
        Palette indexed by `process - 2`. Only used when `color == 'by_proc'`.
        Previously read from a global `cols`; when omitted, a seaborn
        'colorblind' palette long enough for the largest process in
        `value_df` is built.

    Notes
    -----
    Only groups whose rank equals 1 get a legend label (the process number),
    so each process appears once in a legend.
    """
    if value_df.empty:
        return

    if color == 'by_proc' and cols is None:
        # Index `process - 2` must be valid for the largest process present.
        n_colours = max(int(value_df['process'].max()) - 1, 1)
        cols = sns.color_palette('colorblind', n_colours)

    for (mod_n, proc), df in value_df.groupby(['rank', 'process']):
        label = f"{proc}" if mod_n == 1 else None
        col = cols[proc - 2] if color == 'by_proc' else color
        x_span = [mod_n - width, mod_n + width]

        ax.hlines(df['median'], x_span[0], x_span[1], color=col, label=label)

        # One band per row: passing whole columns would interpolate between
        # rows (two rows) or fail on a shape mismatch (three or more rows).
        for lower, upper in zip(df['lb'], df['ub']):
            ax.fill_between(x_span, lower, upper, alpha=0.1, color=col)


In [1]:
# Bar half-width and per-process palette do not change between iterations.
# They are defined at module level (not inside a function) so that plotting
# helpers which look up `width` / `cols` as globals can see them.
width = 0.4
cols = sns.color_palette('colorblind', timescales.process.max())

# The timescale rows at the chosen lag do not depend on k: select them once.
timescales_at_lag = timescales.loc[timescales.lag == lag, :]

# One figure per k: the best model of every feature (by the `median` score at
# `process == k`), placed on the x axis by its overall rank.
for k in range(2, 20):
    rank_in_feature = top_performers_by_feature(vamps, k=k, lag=lag)
    rank_overall = top_performers(vamps, k=k, lag=lag)

    top_vamps = vamps.loc[(vamps.process == k) & (vamps.lag == lag), :].copy()
    top_timescales = timescales_at_lag.copy()

    # hp_ix values without a rank map to NaN and are removed by the filter below.
    top_vamps['rank_by_feature'] = top_vamps['hp_ix'].map(rank_in_feature)
    top_timescales['rank_by_feature'] = top_timescales['hp_ix'].map(rank_in_feature)

    # Keep only the best model of each feature. Explicit copies, because new
    # columns are assigned to these filtered frames right after.
    top_timescales = top_timescales.loc[top_timescales.rank_by_feature == 1, :].copy()
    top_vamps = top_vamps.loc[top_vamps.rank_by_feature == 1, :].copy()

    top_vamps['rank'] = top_vamps['hp_ix'].map(rank_overall)
    top_timescales['rank'] = top_timescales['hp_ix'].map(rank_overall)

    # Express the score as a percentage shortfall from k. The transform is
    # decreasing, so `lb` ends up above `ub`; the shaded band is unaffected.
    for score_col in ('median', 'lb', 'ub'):
        top_vamps[score_col] = 100 * (1 - top_vamps[score_col] / k)

    with sns.plotting_context('talk', font_scale=1):
        fig, axes = plt.subplots(2, 1, figsize=(8, 8), sharex=True)
        vamp_ax, time_ax = axes

        # Top panel: score of each selected model.
        plot_val_by_mod_proc(vamp_ax, top_vamps, color='k')
        vamp_ax.yaxis.set_major_locator(mpl.ticker.MultipleLocator(1))
        vamp_ax.set_ylabel('VAMP-2(k) loss (%)')

        # Bottom panel: timescales of processes 2..5 for the same models.
        plot_val_by_mod_proc(time_ax, top_timescales.query('process <= 5'))
        time_ax.set_yscale('log')
        time_ax.set_ylabel('Timescale (ns)')

        # Label every model with its feature and hp_ix just above the upper
        # bound of its process-2 timescale.
        for feature, df in top_timescales.groupby('feature'):
            proc2_ub = df.loc[df.process == 2, 'ub']
            if proc2_ub.empty:
                # Nothing to anchor the label to.
                continue
            # annotate() needs scalar coordinates, not a pandas Series.
            label_y = float(proc2_ub.max()) * 1.1
            time_ax.annotate(text=f"{feature}\n{df.hp_ix.values[0]}",
                             xy=(df['rank'].values[0], label_y),
                             verticalalignment='bottom',
                             horizontalalignment='center',
                             bbox={'facecolor': 'white', 'linewidth': 0},
                             fontsize=12)

        # Reference line across the full width; restore the limits afterwards
        # so drawing the line does not rescale the x axis.
        xlim = time_ax.get_xlim()
        time_ax.hlines(md_t2[protein], *xlim, color='k', ls='dashed')
        time_ax.set_xlim(xlim)

        time_ax.set_xlabel('Model rank.')
        vamp_ax.set_title(f'Ranked by VAMP-2(k={k})')
        for ax in axes.flatten():
            ax.grid()
            ax.xaxis.set_major_locator(mpl.ticker.MultipleLocator(2))

        fig.tight_layout()
        # NOTE(review): 'comarison' looks like a typo for 'comparison'. The
        # name is kept because other files may reference the existing images.
        fig.savefig(f'{protein}/{protein}_timescale_comarison_by_rank_k_{k}.png', bbox_inches='tight')

        # Display the figure, then release it: 18 figures are created here and
        # none of them is needed once it has been saved and shown.
        plt.show()
        plt.close(fig)

NameError: name 'top_performers_by_feature' is not defined