In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from os.path import join
import pickle
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.model_selection import RepeatedKFold, ShuffleSplit
from sklearn.pipeline import Pipeline
from pyemma.coordinates.clustering import NDGrid
from pyemma.msm import MaximumLikelihoodMSM
import warnings 
warnings.filterwarnings(action='ignore')


In [2]:
def bootstrap_ci(x, ci=(0.05,.95), n_boot=1000):
    medians = []
    for i in range(n_boot):
        medians.append(np.median(np.random.choice(x,size=x.shape[0], replace=True)))
    medians = np.sort(np.array(medians))
    med_ci = -medians[int(ci[0]*n_boot)]+medians[int(ci[1]*n_boot)]

    means = []
    for i in range(n_boot):
        means.append(np.mean(np.random.choice(x,size=x.shape[0], replace=True)))
    means = np.sort(np.array(means))
    mean_ci = -means[int(ci[0]*n_boot)]+means[int(ci[1]*n_boot)]
    
    return med_ci, mean_ci


In [3]:
stride=10
lag = 30
lag_in_strides = int(lag/stride)

In [4]:
paths = ['data/100.0pc/quad_well_{:02d}.npy'.format(x) for x in range(100)]
# paths
X1 = [np.load(x) for x in paths]

len(X1)
X1[0].shape

(84401, 1)

In [5]:
X1 = [x[::stride,:] for x in X1]

In [6]:
X2 = [y for x in X1 for y in np.array_split(x,2)]
X4 = [y for x in X1 for y in np.array_split(x,4)]
X8 = [y for x in X1 for y in np.array_split(x,8)]
X16 = [y for x in X1 for y in np.array_split(x,16)]
X32 = [y for x in X1 for y in np.array_split(x,32)]
X64 = [y for x in X1 for y in np.array_split(x,64)]
X128 = [y for x in X1 for y in np.array_split(x,128)]

In [7]:
data = {'1-Splits': X1, '2-Splits': X2, '4-Splits':  X4, '8-Splits': X8, '16-Splits': X16, 
       '32-Splits': X32,'64-Splits': X64, '128-Splits': X128}

In [8]:
model = Pipeline([('cluster', NDGrid(min=-1.2, max=1.2, n_bins_per_feature=100)),
                   ('msm', MaximumLikelihoodMSM(lag=lag_in_strides,score_method='vampe', score_k=2))])

In [9]:
cv = ShuffleSplit(test_size=0.5, n_splits=20)

In [10]:
search = GridSearchCV(model, cv=cv, param_grid = {'cluster__n_bins_per_feature': [10,11,12,13]})

In [11]:
search.fit(X1)

GridSearchCV(cv=ShuffleSplit(n_splits=20, random_state=None, test_size=0.5, train_size=None),
       error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('cluster', NDGrid(max=1.2, min=-1.2, n_bins_per_feature=100)), ('msm', MaximumLikelihoodMSM(connectivity='largest', count_mode='sliding',
           dt_traj='1 step', lag=3, maxerr=1e-08, maxiter=1000000,
           mincount_connectivity='1/n', reversible=True, score_k=2,
           score_method='vampe', sparse=False, statdist_constraint=None))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'cluster__n_bins_per_feature': [10, 11, 12, 13]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [12]:
results1 = pd.DataFrame(search.cv_results_)

In [13]:
search.fit(X128)

GridSearchCV(cv=ShuffleSplit(n_splits=20, random_state=None, test_size=0.5, train_size=None),
       error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('cluster', NDGrid(max=1.2, min=-1.2, n_bins_per_feature=100)), ('msm', MaximumLikelihoodMSM(connectivity='largest', count_mode='sliding',
           dt_traj='1 step', lag=3, maxerr=1e-08, maxiter=1000000,
           mincount_connectivity='1/n', reversible=True, score_k=2,
           score_method='vampe', sparse=False, statdist_constraint=None))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'cluster__n_bins_per_feature': [10, 11, 12, 13]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [14]:
results128 = pd.DataFrame(search.cv_results_)

In [15]:
results1

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_cluster__n_bins_per_feature,params,rank_test_score,split0_test_score,split0_train_score,split10_test_score,...,split7_test_score,split7_train_score,split8_test_score,split8_train_score,split9_test_score,split9_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,0.056653,0.040854,1.925331,1.926497,10,{'cluster__n_bins_per_feature': 10},2,1.925145,1.925089,1.923682,...,1.926367,1.926516,1.925746,1.925082,1.927559,1.925466,0.01516,0.029599,0.001913,0.000865
1,0.052033,0.030618,1.905084,1.906112,11,{'cluster__n_bins_per_feature': 11},4,1.905143,1.904642,1.903552,...,1.906533,1.905695,1.905279,1.904838,1.907145,1.905224,0.006459,0.00408,0.001933,0.000862
2,0.055825,0.035205,1.925802,1.926979,12,{'cluster__n_bins_per_feature': 12},1,1.925614,1.925566,1.924155,...,1.92681,1.927023,1.926175,1.925579,1.928026,1.925945,0.009502,0.006918,0.001911,0.000853
3,0.061057,0.037207,1.915345,1.916388,13,{'cluster__n_bins_per_feature': 13},3,1.915242,1.915001,1.913603,...,1.916565,1.916212,1.91569,1.915015,1.917309,1.915613,0.013858,0.011163,0.00189,0.000818


In [16]:
results128

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_cluster__n_bins_per_feature,params,rank_test_score,split0_test_score,split0_train_score,split10_test_score,...,split7_test_score,split7_train_score,split8_test_score,split8_train_score,split9_test_score,split9_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,0.444126,0.322199,1.925881,1.926515,10,{'cluster__n_bins_per_feature': 10},2,1.926912,1.925695,1.925737,...,1.924983,1.927303,1.925666,1.926361,1.925813,1.92644,0.041779,0.047358,0.000707,0.000589
1,0.412858,0.333439,1.905731,1.906325,11,{'cluster__n_bins_per_feature': 11},4,1.906922,1.905315,1.905509,...,1.904973,1.906996,1.905439,1.906285,1.905397,1.906516,0.046326,0.047022,0.000762,0.000643
2,0.555056,0.416262,1.926368,1.926971,12,{'cluster__n_bins_per_feature': 12},1,1.927388,1.926161,1.926259,...,1.925524,1.927705,1.926109,1.926876,1.92629,1.926903,0.107102,0.097636,0.00069,0.000567
3,0.671866,0.498669,1.915989,1.916547,13,{'cluster__n_bins_per_feature': 13},3,1.917293,1.915432,1.915986,...,1.914975,1.91746,1.915715,1.91648,1.915765,1.916636,0.226926,0.182923,0.000746,0.000615


In [17]:
model.set_params(msm__score_method='vamp2')

Pipeline(memory=None,
     steps=[('cluster', NDGrid(max=1.2, min=-1.2, n_bins_per_feature=100)), ('msm', MaximumLikelihoodMSM(connectivity='largest', count_mode='sliding',
           dt_traj='1 step', lag=3, maxerr=1e-08, maxiter=1000000,
           mincount_connectivity='1/n', reversible=True, score_k=2,
           score_method='vamp2', sparse=False, statdist_constraint=None))])

In [18]:
search = GridSearchCV(model, cv=cv, param_grid = {'cluster__n_bins_per_feature': [10,11,12,13]})

In [19]:
search.fit(X1)

GridSearchCV(cv=ShuffleSplit(n_splits=20, random_state=None, test_size=0.5, train_size=None),
       error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('cluster', NDGrid(max=1.2, min=-1.2, n_bins_per_feature=100)), ('msm', MaximumLikelihoodMSM(connectivity='largest', count_mode='sliding',
           dt_traj='1 step', lag=3, maxerr=1e-08, maxiter=1000000,
           mincount_connectivity='1/n', reversible=True, score_k=2,
           score_method='vamp2', sparse=False, statdist_constraint=None))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'cluster__n_bins_per_feature': [10, 11, 12, 13]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [20]:
results_vamp2 = pd.DataFrame(search.cv_results_)

In [21]:
results_vamp2

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_cluster__n_bins_per_feature,params,rank_test_score,split0_test_score,split0_train_score,split10_test_score,...,split7_test_score,split7_train_score,split8_test_score,split8_train_score,split9_test_score,split9_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,0.060999,0.038424,1.926335,1.9267,10,{'cluster__n_bins_per_feature': 10},2,1.92707,1.926,1.925875,...,1.924927,1.928136,1.927798,1.925269,1.925831,1.927175,0.019789,0.009277,0.000862,0.000861
1,0.055672,0.037171,1.905986,1.906373,11,{'cluster__n_bins_per_feature': 11},4,1.905999,1.906421,1.905493,...,1.904619,1.907787,1.906949,1.905474,1.905768,1.90654,0.00908,0.010874,0.000738,0.000746
2,0.053734,0.032042,1.926827,1.927159,12,{'cluster__n_bins_per_feature': 12},1,1.927557,1.926461,1.926364,...,1.925422,1.928591,1.928258,1.925759,1.926309,1.927647,0.006108,0.004893,0.00085,0.00085
3,0.055869,0.033503,1.916254,1.916658,13,{'cluster__n_bins_per_feature': 13},3,1.916527,1.916435,1.915833,...,1.915023,1.917926,1.917104,1.915861,1.915981,1.916894,0.005623,0.004959,0.000728,0.000724


In [22]:
X = [np.load(x) for x in paths]



In [23]:
model.set_params(msm__lag=30)

Pipeline(memory=None,
     steps=[('cluster', NDGrid(max=1.2, min=-1.2, n_bins_per_feature=100)), ('msm', MaximumLikelihoodMSM(connectivity='largest', count_mode='sliding',
           dt_traj='1 step', lag=30, maxerr=1e-08, maxiter=1000000,
           mincount_connectivity='1/n', reversible=True, score_k=2,
           score_method='vamp2', sparse=False, statdist_constraint=None))])

In [24]:
search = GridSearchCV(model, cv=cv, param_grid = {'cluster__n_bins_per_feature': [10,11,12,13]})
search.fit(X)

GridSearchCV(cv=ShuffleSplit(n_splits=20, random_state=None, test_size=0.5, train_size=None),
       error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('cluster', NDGrid(max=1.2, min=-1.2, n_bins_per_feature=100)), ('msm', MaximumLikelihoodMSM(connectivity='largest', count_mode='sliding',
           dt_traj='1 step', lag=30, maxerr=1e-08, maxiter=1000000,
           mincount_connectivity='1/n', reversible=True, score_k=2,
           score_method='vamp2', sparse=False, statdist_constraint=None))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'cluster__n_bins_per_feature': [10, 11, 12, 13]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [25]:
results_nostride = pd.DataFrame(search.cv_results_)

In [26]:
results_nostride

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_cluster__n_bins_per_feature,params,rank_test_score,split0_test_score,split0_train_score,split10_test_score,...,split7_test_score,split7_train_score,split8_test_score,split8_train_score,split9_test_score,split9_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,0.506178,0.325052,1.92634,1.926775,10,{'cluster__n_bins_per_feature': 10},2,1.925781,1.927336,1.926196,...,1.926317,1.926803,1.928078,1.925009,1.926426,1.926691,0.119178,0.124835,0.000766,0.000768
1,0.502216,0.269792,1.906279,1.90665,11,{'cluster__n_bins_per_feature': 11},4,1.905981,1.906944,1.906415,...,1.906102,1.906839,1.907522,1.905375,1.906541,1.906392,0.157454,0.060482,0.000654,0.000656
2,0.545912,0.333645,1.926772,1.927196,12,{'cluster__n_bins_per_feature': 12},1,1.926222,1.92775,1.926613,...,1.926755,1.927218,1.928536,1.925406,1.926856,1.927114,0.166506,0.076023,0.000762,0.000764
3,0.675318,0.447391,1.916458,1.916846,13,{'cluster__n_bins_per_feature': 13},3,1.916092,1.91721,1.916587,...,1.916328,1.916987,1.917769,1.915497,1.916649,1.916658,0.260624,0.219087,0.00067,0.000672
