In [1]:
import h5py 
import pandas as pd
import matplotlib.pyplot as plt
from sensetools.plots import vamps_and_features

In [2]:
# Input files: the per-protein MSM summary and the hyperparameter sample definitions.
summary = '../data/msms/1fme/summary.h5'
hp_definitions = '../data/msms/hpsample.h5'

# Read the chosen model settings stored as attributes on the 'summary' group.
with h5py.File(summary, 'r') as f:
    grp = f['summary']
    chosen = {key: grp.attrs[key] for key in ('chosen_lag', 'chosen_k', 'protein_name')}

# lag and k are stored as HDF5 attributes; coerce them to plain Python ints.
lag = int(chosen['chosen_lag'])
k = int(chosen['chosen_k'])
name = chosen['protein_name']


In [86]:
# Table returned by vamps_and_features, with the 'lag' index level turned into a column
# so that it can be filtered on.
vamps = vamps_and_features(summary, hp_definitions).reset_index(level=['lag'])

# Keep only the rows at the chosen lag; the now-constant 'lag' column is then dropped.
at_chosen_lag = vamps.lag == lag
vamps = vamps.loc[at_chosen_lag, :].drop(labels=['lag'], axis=1)

In [87]:
# Timescale-ratio ("gap") table restricted to the chosen lag, with incomplete rows removed.
gaps = (
    pd.DataFrame(pd.read_hdf(summary, key='timescale_ratio'))
    .reset_index(level=['lag'])
    .loc[lambda frame: frame.lag == lag, :]
    .drop(labels=['lag'], axis=1)
    .dropna()
)

# Columns present in both tables get '_vamp' / '_gap' appended to tell them apart.
suffixes = ['_vamp', '_gap']

# NOTE: this overwrites `vamps` and flattens its index, so re-running this cell
# without first re-running the cell that builds `vamps` will merge into an
# already-merged frame.
vamps = vamps.merge(
    gaps,
    how='inner',
    left_index=True,
    right_index=True,
    suffixes=suffixes,
).reset_index()

In [88]:
def show_top(df, cutoff=0.03):
    """Return, for each feature, the largest-gap row(s) among the near-top VAMP rows.

    A row is kept when ``1 - median_vamp / process < cutoff``. The surviving
    rows are ranked by ``median_gap`` (largest first) within each ``feature``
    and only the rank-1 rows are returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``median_vamp``, ``process``, ``feature``
        and ``median_gap``. It is not modified.
    cutoff : float, default 0.03
        Upper bound (exclusive) on ``1 - median_vamp / process`` for a row
        to be considered.

    Returns
    -------
    pandas.DataFrame
        Copy of the selected rows with an extra ``rank_gap_per_feature``
        column (always 1.0). Rows tied for the largest gap within a feature
        are all returned.
    """
    # NOTE(review): `process` is presumably the number of scored processes,
    # making this a relative shortfall from the maximum score - confirm.
    shortfall = 1 - df['median_vamp'] / df['process']
    top = df.loc[shortfall < cutoff, :].copy()

    # method='min' gives every row tied for the largest gap a rank of 1.
    # The default ('average') would give tied leaders 1.5 each, and the
    # feature would then vanish from the result.
    top['rank_gap_per_feature'] = (
        top.groupby(['feature'])['median_gap'].rank(ascending=False, method='min')
    )

    return top.loc[top['rank_gap_per_feature'] == 1, :]
    

In [92]:
# Per feature, the row with the largest median gap among rows passing the 0.02 VAMP cutoff.
top_per_feature = show_top(vamps, cutoff=0.02)
top_per_feature

Unnamed: 0,hp_ix,process,median_vamp,lb_vamp,ub_vamp,count_vamp,feature,median_gap,lb_gap,ub_gap,count_gap,rank_gap_per_feature
386,86,2,1.961894,1.953491,1.968646,100,logit(dist.),11.807128,9.849849,15.667964,100,1.0
532,60,2,1.963484,1.95705,1.969161,100,dist.,2.340882,1.156287,5.23808,100,1.0
677,81,2,1.96024,1.951286,1.966929,100,dihed.,11.900826,9.905592,15.723409,100,1.0


In [93]:
# Rank median VAMP scores (1 = highest) within each (process, feature) group.
vamp_ranks = (
    vamps
    .groupby(['process', 'feature'], as_index=False)['median_vamp']
    .rank(ascending=False)
)
vamps['vamp_rank_per_feature'] = vamp_ranks

# Display ordered by process, then by rank within the process.
vamps.sort_values(by=['process', 'vamp_rank_per_feature'])

Unnamed: 0,hp_ix,process,median_vamp,lb_vamp,ub_vamp,count_vamp,feature,median_gap,lb_gap,ub_gap,count_gap,vamp_rank_per_feature
274,53,2,1.970976,1.962065,1.986006,100,dihed.,2.269628,1.087330,20.312834,100,1.0
532,60,2,1.963484,1.957050,1.969161,100,dist.,2.340882,1.156287,5.238080,100,1.0
1290,52,2,1.967703,1.961617,1.973702,100,logit(dist.),1.557833,1.122729,6.393267,100,1.0
951,33,2,1.967338,1.958334,1.980434,100,logit(dist.),1.698410,1.040743,37.363467,100,2.0
1252,8,2,1.963457,1.958161,1.972797,100,dist.,1.490528,1.109227,10.612373,100,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...
721,40,20,12.414276,11.471247,13.265532,100,dihed.,1.035172,1.005005,1.112661,100,35.0
330,48,20,11.976475,10.892145,13.009526,100,dihed.,1.035188,1.004825,1.132446,100,36.0
1114,80,20,11.686761,10.624179,12.676681,100,dihed.,1.042192,1.006675,1.138558,100,37.0
1436,90,20,11.221847,10.457821,12.215291,99,dihed.,1.058211,1.003907,1.183888,99,38.0


In [47]:
# Rows for the 'dist.' feature, ordered from largest to smallest median gap.
is_dist = vamps['feature'] == 'dist.'
vamps.loc[is_dist, :].sort_values(by='median_gap', ascending=False)

Unnamed: 0,hp_ix,process,median_vamp,lb_vamp,ub_vamp,count_vamp,feature,median_gap,lb_gap,ub_gap,count_gap,rank_vamp_per_proc,rank_gap_per_feature
1196,12,2,1.955640,1.948739,1.963764,100,dist.,5.078195,2.863688,7.795656,100,53.0,1.0
1232,16,2,1.959732,1.951614,1.966626,100,dist.,4.112129,1.536300,6.889376,100,39.0,2.0
1225,57,2,1.954012,1.945903,1.962375,100,dist.,3.950333,1.712783,6.765889,100,58.0,3.0
1544,88,2,1.927643,1.916612,1.941842,100,dist.,3.074728,1.499165,4.796777,100,91.0,4.0
1078,59,2,1.943997,1.934144,1.952701,100,dist.,2.859638,1.265975,4.670444,100,77.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
619,42,20,16.649397,15.992891,17.114722,100,dist.,1.027893,1.004791,1.101074,100,21.0,318.0
1165,27,20,15.505645,14.946321,16.162843,100,dist.,1.027686,1.005087,1.093741,100,44.0,319.0
1164,27,19,14.853619,14.330597,15.485474,100,dist.,1.027619,1.005646,1.118836,100,44.0,320.0
892,97,19,15.452758,14.972868,15.958639,100,dist.,1.025853,1.004489,1.110358,100,25.0,321.0
