In [3]:
'''
This code is the Python implementation form Chapter 16 page 392ff from
Ben Lambert's book 'A Student's Guide to Bayesian Statisitcs'

We are using the same fake data here as in model_validation.ipynb but are
conducting a k-fold validation that splits the data into test and train datasets
'''

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pystan as stan
import stanity #wrapper for fit output of stan models to conduct LOO cross validation (there is no way to do WAIC in Python)
import seaborn as sns
from sklearn.model_selection import KFold
from scipy.stats import norm

#nice plot aesthetic
sns.set()  
plt.style.use('seaborn-darkgrid')

def kFold(StanModel,testIndices,Data):
    elpd_out = np.array([])
    for train_index, test_index in testIndices.split(Data): #for each of the kfold mutations
        X_train, X_test = Data[train_index], Data[test_index]  #get the data
        
        #Fit Stan model
        model = stan.StanModel(file=StanModel)
        fit = model.sampling(data={'NTest':2000,'NTrain':8000,'XTest':X_test,'XTrain':X_train},iter=200,chains=4)

        #Loglikelihood
        extracted = fit.extract()
        loo = stanity.psisloo(extracted['logLikelihood'])        
        elpd_out=np.append(elpd_out, loo.elpd)
        
    return elpd_out

def main():

    N=10000
    #Student-t distribution with nu=5
    X=np.random.standard_t(5,N)

    #Split sample kfold into test and training datasets
    testIndices = KFold(n_splits=5)
    
    #Run Stan models to get results
    loo1 = kFold('normal_kfolds.stan',testIndices,X)
    loo2 = kFold('student_kfolds.stan',testIndices,X)
    
    print("Model 1 Loo elpd ", np.sum(loo1))
    print("Model 2 Loo elpd ",np.sum(loo2))
    
    difference=np.sum(loo2)-np.sum(loo1)
    sd=np.sqrt(1000)*np.std(difference)
    pvalue=1-norm.cdf(difference/sd)
    print("Difference ",difference," SD ",sd, " pvalue ",pvalue)
    
    
main()

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_ae8bfac8cb31b97216547a3c8790ab83 NOW.
To run all diagnostics call pystan.check_hmc_diagnostics(fit)
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_ae8bfac8cb31b97216547a3c8790ab83 NOW.
To run all diagnostics call pystan.check_hmc_diagnostics(fit)
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_ae8bfac8cb31b97216547a3c8790ab83 NOW.
To run all diagnostics call pystan.check_hmc_diagnostics(fit)
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_ae8bfac8cb31b97216547a3c8790ab83 NOW.
To run all diagnostics call pystan.check_hmc_diagnostics(fit)
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_ae8bfac8cb31b97216547a3c8790ab83 NOW.
To run all diagnostics call pystan.check_hmc_diagnostics(fit)
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_bc5a650de2bb7160bd55f3ba3933b865 NOW.
To run all diagnostics call pystan.check_hmc_diagnostics(fit)
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_bc5a650d

Model 1 Loo elpd  -16711.110600400567
Model 2 Loo elpd  -16271.54229291516
Difference  439.5683074854078  SD  0.0  pvalue  0.0


