In [18]:
import pandas as pd
import patsy as pt
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
%matplotlib inline
import re
import pymc3 as pm
import matplotlib.ticker as tk
import re
from sklearn.model_selection import StratifiedKFold
import pickle

## Import data

In [4]:
# Read the cleaned trial results, give the three columns of interest short
# names, and keep only those columns (in the order basis, y, n).
column_aliases = {'project_name': 'basis', 'cluster__n_clusters': 'n', 'test_mean': 'y'}
df = (
    pd.read_csv('outputs/ala1_trials_clean.csv')
    .rename(columns=column_aliases)
    .loc[:, ['basis', 'y', 'n']]
)

## Scale predictors


In [6]:
# Min-max scale the selected predictors and attach the results to `df` as
# new columns carrying an '_s' suffix (e.g. 'n' -> 'n_s').
to_scale = ['n']
scaled_names = [x+'_s' for x in to_scale]
scaler = preprocessing.MinMaxScaler()
vars_scaled = pd.DataFrame(
    scaler.fit_transform(df.loc[:, to_scale]),
    columns=scaled_names,
    index=df.index,  # join() aligns on the index, so reuse df's own row labels
)
# Drop scaled columns left over from a previous execution of this cell;
# without this, re-running the cell makes join() fail on overlapping columns.
df = df.drop(columns=scaled_names, errors='ignore').join(vars_scaled)
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,327,328,329,330,331,332,333,334,335,336
basis,psi,rmsd,phipsi,phipsi,phi,positions,rmsd,phi,psi,positions,...,phipsi,rmsd,psi,positions,rmsd,rmsd,phipsi,rmsd,rmsd,rmsd
y,1.79637,1.70957,3.27798,3.28693,1.98877,3.29715,1.70869,1.98756,1.79831,3.16998,...,3.29081,1.71039,1.79486,3.31378,1.71251,1.70886,3.28592,1.71229,1.71155,1.70997
n,77,554,97,95,362,440,942,169,96,33,...,274,156,519,390,150,628,292,127,155,47
n_s,0.0638945,0.547667,0.0841785,0.0821501,0.352941,0.432049,0.941176,0.157201,0.0831643,0.0192698,...,0.263692,0.144016,0.51217,0.381339,0.137931,0.622718,0.281947,0.114604,0.143002,0.0334686


## Create design matrix

In [22]:
# Response vector, and the predictor frame holding every column except the response.
y = df['y']
X = df.loc[:, df.columns.difference(['y'])]


def _strip_patsy_markup(label):
    """Remove every 'C', '(', ')', '[' and ']' character from a column label."""
    return re.sub('C|\\(|\\)|\\[|\\]', '', label)


# Intercept-free design matrix built from the scaled `n_s` term and the
# categorical `basis` term, with the patsy markup stripped from the labels.
X_c = pt.dmatrix('~ 0 + n_s + C(basis)', data=df, return_type='dataframe')
X_c = X_c.rename(columns=_strip_patsy_markup)

## Model fitting functions

In [16]:
def gamma(alpha, beta):
    """
    Make a factory for Gamma-distributed pymc3 variables.

    alpha, beta : passed through to pm.Gamma as keyword arguments

    Returns a one-argument function: given a variable name it returns
    pm.Gamma(name, alpha=alpha, beta=beta). Nothing is created until the
    returned function is called.
    """
    def make_prior(name):
        return pm.Gamma(name, alpha=alpha, beta=beta)

    return make_prior

def hcauchy(beta):
    """
    Make a factory for half-Cauchy-distributed pymc3 variables.

    beta : passed through to pm.HalfCauchy as a keyword argument

    Returns a one-argument function: given a variable name it returns
    pm.HalfCauchy(name, beta=beta). Nothing is created until the returned
    function is called.
    """
    def make_prior(name):
        return pm.HalfCauchy(name, beta=beta)

    return make_prior


def fit_model_1(y, X, kernel_type='rbf', prop_Xu=0.1):
    """
    function to build a sparse (FITC) GP regression model in pymc3 and fit it by MAP

    The covariance function is eta**2 multiplied by one kernel per column of X.
    Each kernel acts on a single column and has its own length scale, named
    'l_<column name>'.

    y : dependent variable
    X : independent variables
    kernel_type : 'RBF', 'Exponential', 'M52' or 'M32' (matched case-insensitively)
    prop_Xu : proportion of observations to use as inducing variables, in (0, 1]

    X, y are pandas objects. We'll use the column names of X.

    Returns (gp, mp, model): the pm.gp.MarginalSparse object, the point returned
    by pm.find_MAP() and the pm.Model that contains them.

    Raises ValueError if kernel_type is not recognised or prop_Xu is outside (0, 1].
    """
    # Kernels are looked up by attribute name so that the arguments can be
    # validated before any model object is created.
    kernel_names = {
        'rbf': 'ExpQuad',
        'exponential': 'Exponential',
        'm52': 'Matern52',
        'm32': 'Matern32',
    }
    kernel_key = str(kernel_type).lower()
    if kernel_key not in kernel_names:
        raise ValueError(
            "Unknown kernel_type {!r}; expected one of 'RBF', 'Exponential', 'M52', 'M32'".format(kernel_type)
        )
    if not 0 < prop_Xu <= 1:
        raise ValueError('prop_Xu must be in (0, 1], got {!r}'.format(prop_Xu))

    with pm.Model() as model:
        # Convert to arrays
        X_a = X.values
        y_a = y.values
        X_cols = list(X.columns)
        n_obs, n_dims = X_a.shape

        # Priors
        l_prior = gamma(1, 0.05)
        eta_prior = hcauchy(2)
        sigma_prior = hcauchy(2)

        # Kernels
        # Product over every input column, i.e. all columns interact.
        kernel_cls = getattr(pm.gp.cov, kernel_names[kernel_key])
        eta = eta_prior('eta')
        cov = eta**2
        for i, col in enumerate(X_cols):
            cov = cov*kernel_cls(n_dims, ls=l_prior('l_'+col), active_dims=[i])

        # Model
        gp = pm.gp.MarginalSparse(cov_func=cov, approx="FITC")

        # Noise model
        sigma_n = sigma_prior('sigma_n')

        # Inducing variables (always at least one, even for very small inputs)
        num_Xu = max(1, int(n_obs*prop_Xu))
        Xu = pm.gp.util.kmeans_inducing_points(num_Xu, X_a)

        # Marginal likelihood (registered on the model under the name 'y_')
        gp.marginal_likelihood('y_', X=X_a, y=y_a, Xu=Xu, noise=sigma_n)
        mp = pm.find_MAP()

    return gp, mp, model

## Main testing loop

In [25]:
# Inputs
kernels = ['M32', 'M52', 'RBF', 'Exponential']
SEED = 42  # base seed for the randomly initialised k-means inducing points

# Outputs: one frame of test-set predictions per (kernel, fold)
fold_preds = []

# iterator
kf = StratifiedKFold(n_splits=10)

for kernel in kernels:
    print(kernel)
    # Folds are stratified on the `basis` column of the raw predictors.
    for idx, (train_idx, test_idx) in enumerate(kf.split(X.values, X['basis'])):
        print('\tfold: {}'.format(idx))
        # subset dataframes for training and testing
        y_train = y.iloc[train_idx]
        X_train = X_c.iloc[train_idx, :]

        y_test = y.iloc[test_idx]
        X_test = X_c.iloc[test_idx, :]

        # fit_model_1 places its inducing points with k-means, which starts from
        # random centroids. Seeding NumPy's global RNG per fold makes the run
        # repeatable and gives every kernel the same starting state on a fold.
        np.random.seed(SEED + idx)

        # Fit gp model
        gp, mp, model = fit_model_1(y=y_train, X=X_train, kernel_type=kernel)

        # Get predictions for evaluation
        with model:
            # predict latent
            mu, var = gp.predict(X_test.values, point=mp, diag=True, pred_noise=False)
            sd_f = np.sqrt(var)

            # predict target (includes noise)
            _, var = gp.predict(X_test.values, point=mp, diag=True, pred_noise=True)
            sd_y = np.sqrt(var)

        res = pd.DataFrame({'f_pred': mu, 'sd_f': sd_f, 'sd_y': sd_y, 'y': y_test.values})
        res.loc[:, 'kernel'] = kernel
        res.loc[:, 'fold_num'] = idx

        # X_test.reset_index() keeps the original row label in an 'index' column.
        # `res` already has a positional index, so it is not reset as well:
        # doing so would add a second column that is also called 'index'.
        fold_preds.append(pd.concat([X_test.reset_index(), res], axis=1))

# ignore_index gives the stacked frame unique row labels.
pred_dfs = pd.concat(fold_preds, ignore_index=True)

null_mu = np.mean(y)
null_sd = np.std(y)

M32
	fold: 0


INFO (theano.gof.compilelock): Refreshing lock /Users/robertarbon/.theano/compiledir_Darwin-19.3.0-x86_64-i386-64bit-i386-3.6.10-64/lock_dir/lock
  result[diagonal_slice] = x
  result[diagonal_slice] = x
  result[diagonal_slice] = x
INFO (theano.gof.compilelock): Refreshing lock /Users/robertarbon/.theano/compiledir_Darwin-19.3.0-x86_64-i386-64bit-i386-3.6.10-64/lock_dir/lock
  result[diagonal_slice] = x
  result[diagonal_slice] = x
  result[diagonal_slice] = x
  result[diagonal_slice] = x
INFO (theano.gof.compilelock): Refreshing lock /Users/robertarbon/.theano/compiledir_Darwin-19.3.0-x86_64-i386-64bit-i386-3.6.10-64/lock_dir/lock
  result[diagonal_slice] = x
logp = 808.32, ||grad|| = 0.2187: 100%|██████████| 70/70 [00:00<00:00, 107.28it/s]   


	fold: 1


INFO (theano.gof.compilelock): Refreshing lock /Users/robertarbon/.theano/compiledir_Darwin-19.3.0-x86_64-i386-64bit-i386-3.6.10-64/lock_dir/lock
  result[diagonal_slice] = x
logp = 760.99, ||grad|| = 0.57331: 100%|██████████| 69/69 [00:00<00:00, 118.89it/s]  


	fold: 2


logp = 745.3, ||grad|| = 0.1638: 100%|██████████| 91/91 [00:01<00:00, 75.60it/s]            


	fold: 3


INFO (theano.gof.compilelock): Refreshing lock /Users/robertarbon/.theano/compiledir_Darwin-19.3.0-x86_64-i386-64bit-i386-3.6.10-64/lock_dir/lock
logp = 575.76, ||grad|| = 0.028576: 100%|██████████| 52/52 [00:00<00:00, 111.33it/s]  


	fold: 4


logp = 691.18, ||grad|| = 0.0030865: 100%|██████████| 42/42 [00:00<00:00, 108.33it/s]  


	fold: 5


INFO (theano.gof.compilelock): Refreshing lock /Users/robertarbon/.theano/compiledir_Darwin-19.3.0-x86_64-i386-64bit-i386-3.6.10-64/lock_dir/lock
logp = 767.26, ||grad|| = 0.66403: 100%|██████████| 69/69 [00:01<00:00, 62.42it/s]  


	fold: 6


INFO (theano.gof.compilelock): Refreshing lock /Users/robertarbon/.theano/compiledir_Darwin-19.3.0-x86_64-i386-64bit-i386-3.6.10-64/lock_dir/lock
logp = 711.21, ||grad|| = 0.017342: 100%|██████████| 44/44 [00:00<00:00, 115.46it/s]  


	fold: 7


logp = 756.97, ||grad|| = 0.054455: 100%|██████████| 65/65 [00:00<00:00, 96.31it/s]  


	fold: 8


INFO (theano.gof.compilelock): Refreshing lock /Users/robertarbon/.theano/compiledir_Darwin-19.3.0-x86_64-i386-64bit-i386-3.6.10-64/lock_dir/lock
logp = 719.25, ||grad|| = 0.018069: 100%|██████████| 54/54 [00:00<00:00, 104.79it/s] 


	fold: 9


logp = 733.43, ||grad|| = 0.0071888: 100%|██████████| 43/43 [00:00<00:00, 97.43it/s]  


M52
	fold: 0


INFO (theano.gof.compilelock): Refreshing lock /Users/robertarbon/.theano/compiledir_Darwin-19.3.0-x86_64-i386-64bit-i386-3.6.10-64/lock_dir/lock
  result[diagonal_slice] = x
  result[diagonal_slice] = x
  result[diagonal_slice] = x
logp = 1,014.8, ||grad|| = 0.0082084: 100%|██████████| 73/73 [00:00<00:00, 111.15it/s]  


	fold: 1


INFO (theano.gof.compilelock): Refreshing lock /Users/robertarbon/.theano/compiledir_Darwin-19.3.0-x86_64-i386-64bit-i386-3.6.10-64/lock_dir/lock
logp = 870.35, ||grad|| = 0.034839: 100%|██████████| 130/130 [00:01<00:00, 111.58it/s] 


	fold: 2


logp = 763.68, ||grad|| = 0.024541: 100%|██████████| 123/123 [00:01<00:00, 122.09it/s] 


	fold: 3


INFO (theano.gof.compilelock): Refreshing lock /Users/robertarbon/.theano/compiledir_Darwin-19.3.0-x86_64-i386-64bit-i386-3.6.10-64/lock_dir/lock
logp = 594.64, ||grad|| = 0.018949: 100%|██████████| 57/57 [00:00<00:00, 131.51it/s]  


	fold: 4


logp = 806.7, ||grad|| = 0.014414: 100%|██████████| 64/64 [00:00<00:00, 124.97it/s]  


	fold: 5


INFO (theano.gof.compilelock): Refreshing lock /Users/robertarbon/.theano/compiledir_Darwin-19.3.0-x86_64-i386-64bit-i386-3.6.10-64/lock_dir/lock
logp = 771.56, ||grad|| = 0.032437: 100%|██████████| 67/67 [00:00<00:00, 94.84it/s]   


	fold: 6


INFO (theano.gof.compilelock): Refreshing lock /Users/robertarbon/.theano/compiledir_Darwin-19.3.0-x86_64-i386-64bit-i386-3.6.10-64/lock_dir/lock
logp = 768.33, ||grad|| = 0.054804: 100%|██████████| 74/74 [00:00<00:00, 78.70it/s]  


	fold: 7


INFO (theano.gof.compilelock): Refreshing lock /Users/robertarbon/.theano/compiledir_Darwin-19.3.0-x86_64-i386-64bit-i386-3.6.10-64/lock_dir/lock
logp = 738.22, ||grad|| = 0.26296: 100%|██████████| 70/70 [00:00<00:00, 87.66it/s]  


	fold: 8


logp = 612.73, ||grad|| = 0.22063: 100%|██████████| 51/51 [00:00<00:00, 70.81it/s]  


	fold: 9


INFO (theano.gof.compilelock): Refreshing lock /Users/robertarbon/.theano/compiledir_Darwin-19.3.0-x86_64-i386-64bit-i386-3.6.10-64/lock_dir/lock
logp = 950.58, ||grad|| = 0.081094: 100%|██████████| 65/65 [00:00<00:00, 84.78it/s]  


RBF
	fold: 0


  result[diagonal_slice] = x
INFO (theano.gof.compilelock): Refreshing lock /Users/robertarbon/.theano/compiledir_Darwin-19.3.0-x86_64-i386-64bit-i386-3.6.10-64/lock_dir/lock
  result[diagonal_slice] = x
logp = 972.09, ||grad|| = 0.026962: 100%|██████████| 94/94 [00:00<00:00, 126.86it/s]  


	fold: 1


logp = 896.18, ||grad|| = 0.097848: 100%|██████████| 105/105 [00:00<00:00, 139.20it/s] 


	fold: 2


INFO (theano.gof.compilelock): Refreshing lock /Users/robertarbon/.theano/compiledir_Darwin-19.3.0-x86_64-i386-64bit-i386-3.6.10-64/lock_dir/lock
logp = 600.11, ||grad|| = 13.34: 100%|██████████| 41/41 [00:00<00:00, 146.88it/s]   


	fold: 3


logp = 744.43, ||grad|| = 0.086138: 100%|██████████| 119/119 [00:00<00:00, 141.19it/s] 


	fold: 4


INFO (theano.gof.compilelock): Refreshing lock /Users/robertarbon/.theano/compiledir_Darwin-19.3.0-x86_64-i386-64bit-i386-3.6.10-64/lock_dir/lock
logp = 1,085.6, ||grad|| = 0.24609: 100%|██████████| 135/135 [00:00<00:00, 141.22it/s] 


	fold: 5


logp = 815.3, ||grad|| = 2.0013: 100%|██████████| 58/58 [00:00<00:00, 135.28it/s]   


	fold: 6


INFO (theano.gof.compilelock): Refreshing lock /Users/robertarbon/.theano/compiledir_Darwin-19.3.0-x86_64-i386-64bit-i386-3.6.10-64/lock_dir/lock
logp = 619, ||grad|| = 0.051206: 100%|██████████| 67/67 [00:00<00:00, 118.06it/s]   


	fold: 7


logp = 912.03, ||grad|| = 0.040297: 100%|██████████| 52/52 [00:00<00:00, 144.61it/s]  


	fold: 8


INFO (theano.gof.compilelock): Refreshing lock /Users/robertarbon/.theano/compiledir_Darwin-19.3.0-x86_64-i386-64bit-i386-3.6.10-64/lock_dir/lock
logp = 607.49, ||grad|| = 0.26752: 100%|██████████| 70/70 [00:00<00:00, 151.57it/s]  


	fold: 9


logp = 603.39, ||grad|| = 0.02703: 100%|██████████| 57/57 [00:00<00:00, 141.76it/s]  


Exponential
	fold: 0


INFO (theano.gof.compilelock): Refreshing lock /Users/robertarbon/.theano/compiledir_Darwin-19.3.0-x86_64-i386-64bit-i386-3.6.10-64/lock_dir/lock
  result[diagonal_slice] = x
  result[diagonal_slice] = x
logp = 721.01, ||grad|| = 0.012393: 100%|██████████| 53/53 [00:00<00:00, 111.48it/s]  


	fold: 1


INFO (theano.gof.compilelock): Refreshing lock /Users/robertarbon/.theano/compiledir_Darwin-19.3.0-x86_64-i386-64bit-i386-3.6.10-64/lock_dir/lock
logp = 632.33, ||grad|| = 0.109: 100%|██████████| 68/68 [00:00<00:00, 99.11it/s]      


	fold: 2


logp = 649.96, ||grad|| = 0.015095: 100%|██████████| 65/65 [00:00<00:00, 103.89it/s]  


	fold: 3


INFO (theano.gof.compilelock): Refreshing lock /Users/robertarbon/.theano/compiledir_Darwin-19.3.0-x86_64-i386-64bit-i386-3.6.10-64/lock_dir/lock
logp = 619.06, ||grad|| = 0.044871: 100%|██████████| 45/45 [00:00<00:00, 101.64it/s] 


	fold: 4


logp = 594.44, ||grad|| = 0.010825: 100%|██████████| 70/70 [00:00<00:00, 96.40it/s]   


	fold: 5


INFO (theano.gof.compilelock): Refreshing lock /Users/robertarbon/.theano/compiledir_Darwin-19.3.0-x86_64-i386-64bit-i386-3.6.10-64/lock_dir/lock
logp = 701.67, ||grad|| = 10.955: 100%|██████████| 34/34 [00:00<00:00, 105.69it/s]  


	fold: 6


logp = 596.4, ||grad|| = 0.008661: 100%|██████████| 44/44 [00:00<00:00, 119.60it/s]  


	fold: 7


INFO (theano.gof.compilelock): Refreshing lock /Users/robertarbon/.theano/compiledir_Darwin-19.3.0-x86_64-i386-64bit-i386-3.6.10-64/lock_dir/lock
logp = 622.37, ||grad|| = 0.0045917: 100%|██████████| 44/44 [00:00<00:00, 119.74it/s]  


	fold: 8


logp = 633.84, ||grad|| = 0.0029315: 100%|██████████| 48/48 [00:00<00:00, 103.11it/s] 


	fold: 9


INFO (theano.gof.compilelock): Refreshing lock /Users/robertarbon/.theano/compiledir_Darwin-19.3.0-x86_64-i386-64bit-i386-3.6.10-64/lock_dir/lock
logp = 635.39, ||grad|| = 0.0090169: 100%|██████████| 81/81 [00:00<00:00, 95.97it/s]  


## Evaluate kernels

In [29]:
def ll(f_pred, sigma_pred, y_true):
    """
    Negative log density of y_true under a Gaussian with mean f_pred and
    standard deviation sigma_pred (lower is better).

    f_pred : predicted mean(s)
    sigma_pred : predicted standard deviation(s)
    y_true : observed value(s)

    Returns 0.5*log(2*pi*sigma_pred**2) + (f_pred - y_true)**2 / (2*sigma_pred**2),
    evaluated element-wise.
    """
    variance = sigma_pred**2
    log_norm = 0.5*np.log(2*np.pi*variance)
    scaled_sq_err = (f_pred-y_true)**2/(2*variance)
    return log_norm + scaled_sq_err


# Per-prediction standardised loss: ll() of the model's predictive distribution
# minus ll() of the null Gaussian described by null_mu / null_sd.
model_loss = ll(pred_dfs['f_pred'], pred_dfs['sd_y'], pred_dfs['y'])
null_loss = ll(null_mu, null_sd, pred_dfs['y'])
sll = model_loss - null_loss
pred_dfs['msll'] = sll

# Per-prediction squared error, scaled by the variance of y.
squared_error = (pred_dfs['f_pred']-pred_dfs['y'])**2
pred_dfs['smse'] = squared_error/np.var(y)
pred_dfs.to_pickle('outputs/kernel_cv_fits.p')

# Average both scores within each kernel.
by_kernel = pred_dfs.groupby(['kernel'])
msll = by_kernel['msll'].mean()
smse = by_kernel['smse'].mean()

# One row per kernel, columns in the order smse, msll.
summary = pd.concat([smse, msll], axis=1)
summary.to_csv('outputs/kernel_cv_fits_summary.csv')

In [31]:
# Display the per-kernel table of mean 'smse' and 'msll' values.
summary

Unnamed: 0_level_0,smse,msll
kernel,Unnamed: 1_level_1,Unnamed: 2_level_1
Exponential,0.002721,-2.973339
M32,0.002529,-3.421816
M52,0.002311,-3.817193
RBF,0.00317,-4.123877
