In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pathlib import Path
import seaborn as sns
from pandas.api.types import CategoricalDtype
import matplotlib as mpl
import pymc3 as pm
import scipy as sp
import pickle
from sklearn.preprocessing import MinMaxScaler
import patsy as pt



In [2]:
def gamma(alpha, beta):
    """Return a factory for Gamma-distributed PyMC3 variables.

    The returned callable takes a single argument ``x`` (the variable name)
    and evaluates ``pm.Gamma(x, alpha=alpha, beta=beta)`` with the ``alpha``
    and ``beta`` captured here.
    """
    return lambda x: pm.Gamma(x, alpha=alpha, beta=beta)

def hcauchy(beta):
    """Return a factory for half-Cauchy-distributed PyMC3 variables.

    The returned callable takes a single argument ``x`` (the variable name)
    and evaluates ``pm.HalfCauchy(x, beta=beta)`` with the ``beta`` captured
    here.
    """
    return lambda x: pm.HalfCauchy(x, beta=beta)


def fit_gp(y, X, l_prior, eta_prior, sigma_prior, kernel_type='M52', bayes_kws=dict(draws=1000, tune=1000, chains=2, cores=1), prop_Xu=None):
    """
    Build and fit a PyMC3 Gaussian-process regression model.

    The covariance function is ``eta**2`` multiplied by one kernel per column
    of ``X`` (each kernel acts on a single column via ``active_dims``), so the
    total covariance is a product over input dimensions.

    Parameters
    ----------
    y : DataFrame
        Dependent variable. Its values are flattened to 1-D.
    X : DataFrame
        Independent variables, one column per input dimension. The column
        names are used to label the length-scale variables (``'l_<column>'``).
    l_prior, eta_prior, sigma_prior : callable
        Each takes a variable name and returns the prior random variable for
        the length scales, the amplitude ``eta`` and the noise ``sigma_n``
        respectively.
    kernel_type : str
        Case-insensitive kernel name: ``'rbf'``, ``'exponential'``, ``'m52'``
        or ``'m32'``.
    bayes_kws : dict or None
        Keyword arguments forwarded to ``pm.sample``. If None, ``pm.find_MAP``
        is used instead of sampling. The default dict is only unpacked, never
        mutated.
    prop_Xu : float or None
        Proportion of the rows of ``X`` to use as inducing points. If None the
        full marginal likelihood is used; otherwise the sparse FITC
        approximation is used with k-means-selected inducing points.

    Returns
    -------
    (gp, result, model)
        The GP object, the trace (or the MAP estimate when ``bayes_kws`` is
        None) and the PyMC3 model.

    Raises
    ------
    ValueError
        If ``kernel_type`` is not one of the supported kernels, or if
        ``prop_Xu`` does not yield at least one inducing point.
    """
    # Supported kernel names -> class names in ``pm.gp.cov``. The classes are
    # looked up by name later so that arguments can be validated up front.
    kernel_classes = {
        'rbf': 'ExpQuad',
        'exponential': 'Exponential',
        'm52': 'Matern52',
        'm32': 'Matern32',
    }
    kernel_type = kernel_type.lower()
    if kernel_type not in kernel_classes:
        # Previously an unknown name silently left ``cov`` as the bare
        # ``eta**2`` tensor, which is not a covariance function.
        raise ValueError(
            f"Unknown kernel_type {kernel_type!r}; expected one of {sorted(kernel_classes)}"
        )
    if prop_Xu is not None and prop_Xu <= 0:
        raise ValueError(f"prop_Xu must be positive or None, got {prop_Xu!r}")

    with pm.Model() as model:
        # Convert the data frames to arrays
        X_a = X.values
        y_a = y.values.flatten()
        n_obs, n_dims = X_a.shape

        # Kernel: amplitude times a product of one-dimensional kernels,
        # each with its own length-scale prior.
        kernel_cls = getattr(pm.gp.cov, kernel_classes[kernel_type])
        eta = eta_prior('eta')
        cov_tot = eta**2
        for i, col in enumerate(X.columns):
            cov_tot = cov_tot * kernel_cls(n_dims, ls=l_prior(f'l_{col}'), active_dims=[i])

        # Noise model
        sigma_n = sigma_prior('sigma_n')

        # Likelihood
        if prop_Xu is not None:
            # Inducing variables
            num_Xu = int(n_obs * prop_Xu)
            if num_Xu < 1:
                raise ValueError(
                    f"prop_Xu={prop_Xu!r} gives no inducing points for {n_obs} observations"
                )
            Xu = pm.gp.util.kmeans_inducing_points(num_Xu, X_a)
            gp = pm.gp.MarginalSparse(cov_func=cov_tot, approx="FITC")
            gp.marginal_likelihood('y_', X=X_a, y=y_a, Xu=Xu, noise=sigma_n)
        else:
            gp = pm.gp.Marginal(cov_func=cov_tot)
            gp.marginal_likelihood('y_', X=X_a, y=y_a, noise=sigma_n)

        # Inference: full posterior sampling, or a MAP point estimate
        if bayes_kws is not None:
            result = pm.sample(**bayes_kws)
        else:
            result = pm.find_MAP()

    return gp, result, model

In [3]:
# System to analyse, plus the lag and process number used to filter the timescales table.
protein, lag, proc = '1fme', 41, 2

# Feature type and transform used to subset the hyper-parameter trials.
feat, trans = 'distances', 'linear'

In [14]:
# Long column name in the merged data -> short name used in the model formula.
name_dict = {
    'median': 'ts',
    'tica__dim': 'dim',
    'tica__lag': 'lag',
    'cluster__k': 'states',
    'feature__value': 'feat',
    'distances__scheme': 'scheme',
    'distances__transform': 'trans',
    'distances__steepness': 'steep',
    'distances__centre': 'cent',
}
data_cols = list(name_dict.keys())
var_names_short = list(name_dict.values())

# [low, high] pairs keyed by short variable name.
# NOTE(review): not referenced by the cells visible here - confirm whether it is still needed.
scaling = {
    'dim': [1, 20],
    'lag': [1, 100],
    'states': [10, 500],
    'steep': [0, 50],
    'cent': [0, 1.5],
}

Load data

In [15]:
# Input files: per-protein summary tables and the sampled hyper-parameters.
summary_path = f'{protein}/summary.h5'
hp_path = '../experiments/hpsample.h5'

# Read each table and move its index into ordinary columns.
timescales = pd.read_hdf(summary_path, key='timescales').reset_index()
vamps = pd.read_hdf(summary_path, key='vamps').reset_index()
gaps = pd.read_hdf(summary_path, key='timescale_ratio').reset_index()
hps = pd.read_hdf(hp_path).reset_index()

Create main data DF

In [16]:
# Timescales for the chosen process and lag, joined to their hyper-parameters
# on hp_ix, restricted to the modelling columns and given short names.
data = (
    timescales
    .query(f"process=={proc}")
    .query(f'lag=={lag}')
    .merge(hps, on=['hp_ix'], how='left')
    .loc[:, data_cols + ['hp_ix']]
    .rename(columns=name_dict)
)

Create scaled and subsetted DF

In [23]:
# Model formula; the steepness/centre terms are only added for the logistic distance transform.
formula = "ts ~  dim + lag + states + scheme"
# NOTE(review): 'logistics' may be a typo for the value stored in `trans` - confirm against the data.
if feat == 'distances' and trans == 'logistics':
    formula += " + steep + cent"

# Keep only the trials for the chosen feature (and transform, if one is set).
subset = data.query(f"(feat == '{feat}')")
if trans is not None:
    subset = subset.query(f"trans == '{trans}'")

# Build the response / design matrices from the formula.
ydf, Xdf = pt.dmatrices(formula, data=subset, return_type='dataframe', NA_action='raise')

# Scale the response and every predictor jointly to [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))
yX = np.hstack([ydf.values, Xdf.values])
yX_s = scaler.fit_transform(yX)

# Drop the constant Intercept column so it does not become a GP input dimension.
data_s = pd.DataFrame(
    yX_s, columns=list(ydf.columns) + list(Xdf.columns)
).drop(columns=['Intercept'])

# First column is the response, the rest are predictors.
y = data_s.iloc[:, [0]]
X = data_s.iloc[:, 1:]

In [26]:
# Priors for the length scales, the kernel amplitude and the noise.
l_prior, eta_prior, sigma_prior = gamma(2, 0.5), hcauchy(2), hcauchy(2)

# Fit the full (non-sparse) GP with an exponential kernel by posterior sampling.
gp, trace, model = fit_gp(
    y=y,
    X=X,
    l_prior=l_prior,
    eta_prior=eta_prior,
    sigma_prior=sigma_prior,
    kernel_type='exponential',
    prop_Xu=None,  # None: no inducing points, use the full marginal likelihood
    bayes_kws=dict(draws=5000, tune=1000, chains=4, cores=4, target_accept=0.90),
)

# Bundle everything needed downstream.
results = dict(gp=gp, trace=trace, model=model, data=data_s)


You can find the C code in this temporary file: /var/folders/b1/0b53wj0509376104__2tttb00000gn/T/theano_compilation_error__akkyq70


Exception: ("Compilation failed (return status=1): In file included from /Users/robertarbon/.theano/compiledir_macOS-10.16-x86_64-i386-64bit-i386-3.8.0-64/tmpyen9jgqz/mod.cpp:1:. In file included from /Users/robertarbon/opt/miniconda3/envs/msmsense/include/python3.8/Python.h:25:. /Users/robertarbon/opt/miniconda3/envs/msmsense/bin/../include/c++/v1/stdio.h:107:15: fatal error: 'stdio.h' file not found. #include_next <stdio.h>.               ^~~~~~~~~. 1 error generated.. ", 'FunctionGraph(Elemwise{log,no_inplace}(TensorConstant{2.0}))')