In [1]:
from osprey.config import Config
from sklearn.pipeline import Pipeline
from sklearn.model_selection import ShuffleSplit
from msmbuilder.feature_selection import VarianceThreshold
from msmbuilder.decomposition import tICA
from msmbuilder.cluster import MiniBatchKMeans
from msmbuilder.msm import MarkovStateModel
from os.path import join
from glob import glob
import numpy as np




In [2]:
# Globals: file-system locations used by the rest of the notebook.
# Osprey YAML configuration; passed to Config() when the trials are loaded.
config_path = '../../Trial Data/DHFR/Random-GMRQ-2/alpha_angle.yaml'
# Osprey trials database.
# NOTE(review): not referenced by any cell visible here -- presumably the
# config file locates the database itself; confirm before removing.
db_path = '../../Trial Data/DHFR/Random-GMRQ-2/osprey-trials.db'
# Root directory that get_trajectories() searches for per-feature trajectory
# files. NOTE(review): absolute, machine-specific path -- adjust before
# running on another machine.
traj_dir = '/home/robert/Datasets/DHFR/train'

In [3]:
def get_pipeline(parameters):
    """
    Build a fresh, unfitted pipeline so that every CV fold starts from scratch.

    :param parameters: dict of ``<step>__<param>`` settings, forwarded to ``Pipeline.set_params``
    :return: sklearn.pipeline.Pipeline object
    """
    # Steps in processing order: feature filter -> tICA -> clustering -> MSM
    steps = [
        ('variance_cut', VarianceThreshold()),
        ('tica', tICA(kinetic_mapping=True)),
        ('cluster', MiniBatchKMeans()),
        ('msm', MarkovStateModel(use_gap='timescales', lag_time=50, verbose=True)),
    ]
    pipe = Pipeline(steps)

    # Overlay the trial-specific hyper-parameters on top of the defaults above
    pipe.set_params(**parameters)
    return pipe

In [15]:
# Cross-validation iterator; run_trial() calls cv.split() once per trial.
# TODO get this from the config file; the commented-out YAML below looks like
# the corresponding section:
# cv:
#     name: shufflesplit
#     params:
#       n_splits: 5
#       test_size: 0.5
# NOTE(review): only 2 splits are used here, not the 5 listed in the YAML.
# No random_state is passed, so the splits presumably depend on the global
# NumPy RNG state at the time split() is called -- confirm.
cv = ShuffleSplit(n_splits=2, test_size=0.5)

In [5]:
def get_trajectories(feat, base_dir=None):
    """
    Load every trajectory file stored for a feature.

    Files are read in sorted path order so that the returned list -- and
    therefore any index-based cross-validation split of it -- does not depend
    on the arbitrary order in which ``glob`` lists directory entries.

    :param feat: name of the feature sub-directory to read from
    :param base_dir: directory that contains the feature sub-directories;
        defaults to the module-level ``traj_dir``
    :return: list with one ``np.load`` result per matched file, in sorted
        path order (empty if nothing matches)
    """
    # Resolve the default at call time so later changes to traj_dir are honoured
    root = traj_dir if base_dir is None else base_dir
    traj_paths = sorted(glob(join(root, feat, '*')))
    return [np.load(traj_path) for traj_path in traj_paths]

In [6]:
def get_parameters(irow):
    """
    Turn one ``(index, row)`` pair of the trials table into the settings for a re-run.

    :param irow: ``(index, row)`` tuple, e.g. as yielded by ``DataFrame.iterrows()``
    :return: dictionary with keys 'params' (pipeline parameters), 'feature' and 'row'
    """
    row_label, record = irow
    source = record['parameters']

    # Only these hyper-parameters are carried over to the new model
    wanted = ('tica__lag_time', 'tica__n_components', 'cluster__n_clusters')

    return {
        'params': {key: source[key] for key in wanted},
        'feature': record['project_name'],
        'row': row_label,
    }

In [16]:
# Seed the global NumPy RNG before any stochastic step runs
np.random.seed(42)

# Load the stored Osprey trials and keep only the first three rows
config = Config(config_path)
trials = config.trial_results().iloc[:3, :]

Loading config file:     ../../Trial Data/DHFR/Random-GMRQ-2/alpha_angle.yaml...
Loading trials database: sqlite:///osprey-trials.db...


In [8]:
    # Display the subset of trials that will be re-run
    trials

Unnamed: 0,completed,config_sha1,elapsed,host,id,mean_test_score,mean_train_score,n_test_samples,n_train_samples,parameters,project_name,started,status,test_scores,traceback,train_scores,user
0,2017-08-03 20:16:52.451687,11d05ee0f2efe3233a58a40ac81bbb9962bf882a,1970-01-01 00:02:15.233066,compute249.bc4.acrc.priv,2,2.613855,2.922517,"[75000, 75000, 75000, 75000, 75000]","[75000, 75000, 75000, 75000, 75000]","{'cluster__random_state': None, 'tica__lag_tim...",alpha_angle,2017-08-03 20:14:37.218621,SUCCEEDED,"[2.7929254738960707, 2.806366141062694, 2.6499...",,"[2.8238005249605154, 2.870721463825274, 2.9884...",ra15808
1,2017-08-03 20:32:19.317085,11d05ee0f2efe3233a58a40ac81bbb9962bf882a,1970-01-01 00:17:41.918205,compute249.bc4.acrc.priv,3,2.56115,2.970864,"[75000, 75000, 75000, 75000, 75000]","[75000, 75000, 75000, 75000, 75000]","{'cluster__random_state': None, 'tica__lag_tim...",alpha_angle,2017-08-03 20:14:37.398880,SUCCEEDED,"[2.7309232422787275, 2.5163258459570557, 2.645...",,"[2.96773553090778, 2.969495071361183, 2.995966...",ra15808


In [17]:
# One settings dict per trial row, then randomise the order in which they run
new_trial_params = list(map(get_parameters, trials.iterrows()))
np.random.shuffle(new_trial_params)

In [18]:
# Inspect the shuffled trial settings
new_trial_params

[{'feature': 'alpha_angle',
  'params': {'cluster__n_clusters': 101,
   'tica__lag_time': 3,
   'tica__n_components': 1},
  'row': 0},
 {'feature': 'alpha_angle',
  'params': {'cluster__n_clusters': 719,
   'tica__lag_time': 408,
   'tica__n_components': 8},
  'row': 1},
 {'feature': 'alpha_angle',
  'params': {'cluster__n_clusters': 537,
   'tica__lag_time': 120,
   'tica__n_components': 5},
  'row': 2}]

In [19]:
def run_trial(X, params):
    """
    Cross-validate one parameter set and collect per-fold diagnostics.

    A new pipeline is built for every split produced by the module-level
    ``cv`` iterator, fitted on the training trajectories and then scored on
    both the training and the held-out trajectories.

    :param X: indexable collection of trajectories (e.g. the list returned by
        ``get_trajectories``)
    :param params: dict of pipeline parameters forwarded to ``get_pipeline``
    :return: tuple ``(train_scores, train_gaps, train_n_timescales,
        test_scores)``; each element is a list with one entry per fold. A
        test score is ``None`` when scoring the held-out data raised.
    """
    import warnings

    train_scores = []
    train_gaps = []
    train_n_timescales = []
    test_scores = []

    for fold, (train_idx, test_idx) in enumerate(cv.split(X)):
        # Fresh pipeline per fold so no fitted state leaks between folds
        pipe = get_pipeline(params)

        train = [X[i] for i in train_idx]
        pipe.fit(train)

        msm = pipe.named_steps['msm']
        train_n_timescales.append(msm.n_timescales)
        train_gaps.append(msm.gap_)
        train_scores.append(pipe.score(train))

        test = [X[i] for i in test_idx]
        try:
            score = pipe.score(test)
        except Exception as err:
            # Best effort: keep the fold (scored as None) but say why, and let
            # KeyboardInterrupt / SystemExit propagate instead of hiding them.
            warnings.warn(
                'Scoring the test set failed on fold {}: {!r}'.format(fold, err))
            score = None
        test_scores.append(score)

    return train_scores, train_gaps, train_n_timescales, test_scores


In [12]:
# Per-trial accumulators; position k in every list refers to the same trial
all_train_scores, all_train_gaps, all_train_n_timescales = [], [], []
all_test_scores, index = [], []

for trial in new_trial_params:
    # Trajectories of the feature named in this trial's settings
    X = get_trajectories(trial['feature'])
    tr_scores, tr_gaps, tr_n_ts, te_scores = run_trial(X, trial['params'])

    collected = (
        (all_train_scores, tr_scores),
        (all_train_gaps, tr_gaps),
        (all_train_n_timescales, tr_n_ts),
        (all_test_scores, te_scores),
    )
    for store, fold_values in collected:
        store.append(fold_values)

    # Remember which row of the trials table these results belong to
    index.append(trial['row'])



MSM contains 22 strongly connected components above weight=0.02. Component 1 selected, with population 14.666667%
Setting n_timescales to 1 with a timescales gap of 1.08e+01
MSM contains 22 strongly connected components above weight=0.02. Component 1 selected, with population 14.666667%
MSM contains 16 strongly connected components above weight=0.02. Component 0 selected, with population 65.333333%




MSM contains 19 strongly connected components above weight=0.02. Component 6 selected, with population 25.333333%
Setting n_timescales to 1 with a timescales gap of 3.82e+01
MSM contains 19 strongly connected components above weight=0.02. Component 6 selected, with population 25.333333%
MSM contains 13 strongly connected components above weight=0.02. Component 3 selected, with population 42.665263%




MSM contains 20 strongly connected components above weight=0.02. Component 0 selected, with population 30.666667%
Setting n_timescales to 2 with a timescales gap of 4.46e+00
MSM contains 20 strongly connected components above weight=0.02. Component 0 selected, with population 30.666667%
MSM contains 18 strongly connected components above weight=0.02. Component 5 selected, with population 35.995789%
MSM contains 1 strongly connected component above weight=0.02. Component 0 selected, with population 100.000000%
Setting n_timescales to 1 with a timescales gap of 1.23e+01
MSM contains 1 strongly connected component above weight=0.02. Component 0 selected, with population 100.000000%
MSM contains 1 strongly connected component above weight=0.02. Component 0 selected, with population 100.000000%
MSM contains 3 strongly connected components above weight=0.02. Component 1 selected, with population 64.000000%
Setting n_timescales to 4 with a timescales gap of 2.69e+00
MSM contains 3 strongly co

In [13]:
# Training scores: one inner list per trial, one value per CV fold
all_train_scores

[[1.9988869054738618, 1.999295988839076, 2.9945379464631934],
 [1.9995452060717138, 4.5115554889826299, 1.9973411518267319]]

In [14]:
# Row of the trials table each result list belongs to, in run order
index


[1, 0]