In [30]:
import arviz as az
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import pystan
import seaborn as sns
import scipy as sp

# __Appendix 3__ | Modeling

### __Load models__

In [31]:
# Execute the helper script inside this notebook's namespace. `compile_model`, which the
# fitting loop calls, is presumably defined there — verify against utility/stan_utility.py.
%run utility/stan_utility.py

#### Import data

In [32]:
# Column labels for the header-less CSV files. Later cells index the underlying arrays
# by position, so the order of these lists matters.
names = ['streams', 'acousticness', 'danceability', 'energy', 'loudness', 'tempo', 'valence']
# The 2018 column list has no 'energy' entry.
names_2018 = ['streams', 'acousticness', 'danceability', 'loudness', 'tempo', 'valence']

global_data = pd.read_csv('./data/global_selected.csv', header=None, names=names)
# Label the 2018 file with its own column list. Reusing the 7-name 2017 list would shift
# the labels ('energy' would sit on the loudness column) and pad a trailing all-NaN column.
# NOTE(review): assumes the 2018 CSV really has six columns, as `names_2018` and the
# positional indices 1-5 used for the prediction inputs suggest — confirm against the file.
global_data_2018 = pd.read_csv('./data/2018_global_selected.csv', header=None, names=names_2018)

#### Select the top 30 rows of each dataset

In [33]:
# Keep only the first 30 rows of each table. `.copy()` makes each result an independent
# frame, so assigning a column to it later does not write into a slice of the full table
# (which pandas reports as SettingWithCopyWarning).
global_data = global_data.iloc[:30].copy()
global_data_2018 = global_data_2018.iloc[:30].copy()

#### Log-transform the response $Y$ (`streams`, base 10):

In [34]:
# Replace the stream counts with their base-10 logarithm. `.assign` builds a new frame
# instead of writing a column into a possibly sliced one, which avoids pandas'
# SettingWithCopyWarning.
# NOTE: this cell overwrites `global_data`, so run it exactly once per kernel session;
# running it again applies the logarithm a second time.
global_data = global_data.assign(streams=np.log10(global_data['streams']))

#### Define method for calculating and combining LOO scores:

In [35]:
def calculate_and_print_loo(fit, loos):
    """Compute the LOO table for one fitted model and add it to the running table.

    Despite the name, nothing is printed; the per-model table is returned instead.

    This function reads four notebook-level globals that must be set before each call:
    ``model_name`` (label stored in the 'Model' column) and ``div_check``,
    ``tree_check``, ``energy_check`` (stored in the 'div', 'treedepth' and 'energy'
    columns).

    Parameters
    ----------
    fit : fitted Stan model object
        Must support ``extract`` for the parameters 'a', 'b' and 'sigma', and provide
        the quantities 'y', 'ypred' and 'log_lik' that are handed to ArviZ by name.
    loos : pandas.DataFrame
        Comparison table accumulated over the models processed so far.

    Returns
    -------
    tuple
        ``(loo_m indexed by 'Model', loos with this model's rows added)``.
    """
    # NOTE(review): these draws are extracted from `fit` itself and then passed to ArviZ
    # as `prior` — confirm that this is intended and accepted by the ArviZ version in use.
    prior_dict = fit.extract(pars=['a','b','sigma'], permuted=False)
    azfit = az.from_pystan(fit=fit, prior=prior_dict,
                           observed_data='y',
                           posterior_predictive='ypred',
                           log_likelihood='log_lik')
    # The code below treats the result as a DataFrame (column assignment, set_index).
    loo_m = az.loo(azfit)
    loo_m['Model'] = model_name
    loo_m['div'], loo_m['treedepth'], loo_m['energy'] = div_check, tree_check, energy_check
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
    loos = pd.concat([loos, loo_m], sort=True)

    return loo_m.set_index('Model'), loos

#### Fit all 8 models:

In [41]:
# One record per model: (Stan source file, display name, length of the `mu_b` vector).
# Keeping the three values side by side prevents the derived lists from drifting apart.
model_specs = [
    ('lin_3_uninformative.stan',
     'Linear; 3 Predictors Uninformative', 3),
    ('lin_3_informative.stan',
     'Linear; 3 Predictors Weakly Informative, Normal', 3),
    ('lin_5_informative.stan',
     'Linear; 5 Predictors Weakly Informative, Normal', 5),
    ('lin_5_informative-interaction.stan',
     'Non-Linear; 5 Predictors Weakly Informative inter 1, Normal', 7),
    ('lin_5_informative-interaction2.stan',
     'Non-Linear; 5 Predictors Weakly Informative inter 2, Normal', 9),
    ('lin_5_informative-second_order.stan',
     'Non-Linear; 5 Predictors Weakly Informative squared, Normal', 12),
    ('lin_5_informative-second_order-inter.stan',
     'Non-Linear; 5 Predictors Weakly Informative squared inter, Normal', 14),
    ('lin_5_informative-third_order-inter.stan',
     'Non-Linear; 5 Predictors Weakly Informative cubic, Normal', 19),
]

# Parallel lists consumed by the fitting loop.
model_filenames = [stan_file for stan_file, label, n_coef in model_specs]
model_names = [label for stan_file, label, n_coef in model_specs]
nums_b = [n_coef for stan_file, label, n_coef in model_specs]

# Empty comparison table; one model's LOO rows are added per fit.
loo_columns = ["loo", "warning", "treedepth", "div"]
loos = pd.DataFrame(index=[], columns=loo_columns)

In [None]:
# The data frames do not change between models, so convert them to arrays once.
gb_2017 = global_data.values
gb_2018 = global_data_2018.values

for filename, num_b, model_name in zip(model_filenames, nums_b, model_names):
    # Inputs for the Stan program. Predictor columns are selected by position:
    # column 3 of the 2017 array is skipped, while the 2018 array uses columns 1-5.
    data = dict(N=gb_2017.shape[0],
                y=gb_2017[:,0],
                X1=gb_2017[:,1], X2=gb_2017[:,2], X3=gb_2017[:,4], X4=gb_2017[:,5], X5=gb_2017[:,6],
                Npred=gb_2018.shape[0],
                X1pred=gb_2018[:,1], X2pred=gb_2018[:,2], X3pred=gb_2018[:,3], X4pred=gb_2018[:,4], X5pred=gb_2018[:,5],
                mu_a=0,
                mu_b=np.zeros(num_b),
                sigma_0=0.12*10)

    model = compile_model(filename, model_name=model_name)
    fit = model.sampling(data=data, seed=2222)

    # Run the sampler diagnostics on the fit that was just produced. Computing them
    # before sampling raises NameError on a fresh kernel and otherwise records the
    # previous model's diagnostics against the current model.
    # These three names (and `model_name`) are read as globals by calculate_and_print_loo.
    div_check = pystan.diagnostics.check_div(fit, verbose=2)
    tree_check = pystan.diagnostics.check_treedepth(fit, verbose=2)
    energy_check = pystan.diagnostics.check_energy(fit, verbose=2)

    p, loos = calculate_and_print_loo(fit, loos)
        

#### Results

In [55]:
# Index the accumulated comparison table by model label, then display it.
loos_file = loos.set_index(keys='Model')
loos_file

Unnamed: 0_level_0,div,energy,loo,loo_se,p_loo,treedepth,warning
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Linear; 3 Predictors Uninformative,False,True,330.12607,1193.251565,1216.707605,True,1
"Linear; 3 Predictors Weakly Informative, Normal",True,True,414.275619,1201.617354,1270.207832,True,1
"Linear; 5 Predictors Weakly Informative, Normal",True,True,568.040652,1219.729017,1318.150582,True,1
"Non-Linear; 5 Predictors Weakly Informative inter 1, Normal",True,True,1692.046913,1611.779674,1805.222313,True,1
"Non-Linear; 5 Predictors Weakly Informative inter 2, Normal",True,True,3072.952117,1970.499454,2411.396402,True,1
"Non-Linear; 5 Predictors Weakly Informative squared, Normal",True,True,2255.69728,1633.00488,2001.678876,True,1
"Non-Linear; 5 Predictors Weakly Informative squared inter, Normal",True,True,3231.741395,1890.545766,2446.777666,True,1
"Non-Linear; 5 Predictors Weakly Informative cubic, Normal",True,True,7528.847211,3188.444969,4436.048235,True,1


#### Save to file

In [54]:
# Write the model-indexed comparison table to the working directory.
loos_file.to_csv(path_or_buf='model_comparison.csv')