In [22]:
import os
import random
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from cmdstanpy import CmdStanModel
import cmdstanpy
# Specify CmdStan location via environment variable.
# A CMDSTAN value that is already exported takes precedence, so the notebook
# also runs on machines other than the author's; the original conda path is
# kept only as the fallback.
os.environ.setdefault('CMDSTAN', '/Users/henryqu/miniforge3/envs/cmdstan/bin/cmdstan')

import plotly.io as pio
# Make "simple_white" the default template for the Plotly figures in this notebook.
pio.templates.default = "simple_white"

import utils
from nteprsm.constants import MONTH2ABBR
# Inverse lookup of MONTH2ABBR: each value of MONTH2ABBR maps back to its key.
ABBR2MONTH = dict(zip(MONTH2ABBR.values(), MONTH2ABBR.keys()))

# Simulation for parameter recovery

## Set parameters for the simulation

In [24]:
# Fixed seed so the simulated layout, effects and ratings are the same on every
# Restart & Run All: the cells below draw from both `random` and NumPy's
# global RNG.
SEED = 123
random.seed(SEED)
np.random.seed(SEED)

T = 5      # Total number of years
K = 12     # Total number of months in which ratings were conducted
J = 10     # Total number of cultivars evaluated
M = 9      # Total number of rating categories
R = 5      # Total number of raters
sigma = 0.7     # SD of the simulated cultivar (entry) effects
alpha = 0.15    # passed to utils.distmatrix_to_expquadkernel for the plot-location effect
inv_rho = 0.4   # passed to the same kernel; presumably an inverse length-scale -- confirm in utils
sigma_e = 0.2   # passed to the same kernel; presumably a noise SD -- confirm in utils

In [25]:
## trial layout
workbook = './model_data/demo_layout.xlsx'

# Plot codes: read the 'layout' sheet and reshape it to long format.
layout = utils.transform_layout_to_long_format(
    pd.read_excel(workbook, sheet_name='layout', index_col=0),
    'PLOC_CODE',
)

# Three independent random orderings of the J cultivar codes, concatenated in
# draw order (this presumes the layout has 3*J plots -- TODO confirm against
# the workbook).
entry_codes = [
    code
    for _ in range(3)
    for code in random.sample(range(1, J + 1), J)
]

# Plot-location effects: the 'ploc_eff' sheet gets the same reshaping.
ploc_effs = utils.transform_layout_to_long_format(
    pd.read_excel(workbook, sheet_name='ploc_eff', index_col=0),
    'PLOC_EFF',
)

# Attach the cultivar codes, join the location effects on grid position and
# order the plots by their code.
layout = (
    layout
    .assign(ENTRY_CODE=entry_codes)
    .merge(ploc_effs, on=['ROW', 'COL'])
    .sort_values(by='PLOC_CODE')
)

In [26]:
# Heatmap of the trial grid: colour encodes the plot-location effect and every
# cell is annotated with the cultivar assigned to it.
cultivar_labels = [f'Cultivar-{code}' for code in layout.ENTRY_CODE]
hover_lines = (
    'Col # : %{x}<br>',
    'Row # : %{y}<br>',
    'Cultivar : %{text}<br>',
    'Plot Loc Effect: %{z:.2f}<extra></extra>',
)
layout_heatmap = go.Heatmap(
    z=layout.PLOC_EFF,
    y=layout.ROW,
    x=layout.COL,
    text=cultivar_labels,
    texttemplate="%{text}",
    textfont={"size": 10},
    hovertemplate=''.join(hover_lines),
    colorbar={'title': 'Plot Location <br>Effects'},
)

fig = go.Figure(data=[layout_heatmap])
fig.update_layout(
    title='Examplary Layout',
    xaxis={'title': 'Column #', 'dtick': 1},
    yaxis={'title': 'Row #', 'dtick': 1},
    autosize=False,
    width=500,
    height=500,
)
fig.show()

In [27]:
# simulate plot location effect
# Distance matrix between plots -> covariance kernel -> T*K multivariate-normal
# draws (one row each) centred on layout.PLOC_EFF, with columns labelled 1..3*J.
dist_matrix = utils.calc_dist_matrix(layout, index_col='PLOC_CODE')
kernel = utils.distmatrix_to_expquadkernel(dist_matrix, alpha, inv_rho, sigma_e)

n_draws = T * K
ploc_columns = list(range(1, 3 * J + 1))
plot_effect = pd.DataFrame(
    np.random.multivariate_normal(mean=layout.PLOC_EFF, cov=kernel, size=n_draws),
    columns=ploc_columns,
)

In [28]:
# entry effect
# One normally distributed effect (mean 0, SD sigma) for each cultivar code 1..J.
cultivar_codes = list(range(1, J + 1))
entry_draws = np.random.normal(loc=0, scale=sigma, size=J)
entry_effect = pd.DataFrame({'ENTRY_CODE': cultivar_codes, 'ENTRY_EFF': entry_draws})

# rating severity
# A code in 1..R is drawn at random for each of the T*K rows; `beta` holds R
# evenly spaced severities from -0.8 to 0.8.
rater_codes = list(range(1, R + 1))
rating_event_code = np.random.choice(rater_codes, size=T * K)
beta = np.linspace(-0.8, 0.8, num=R)

In [29]:
# Pair every drawn code with its severity; codes are 1-based, hence the -1
# when indexing into `beta`.
betas = pd.DataFrame({
    'RATING_EVENT_CODE': rating_event_code,
    'BETA': beta[rating_event_code - 1],
})

In [30]:
# Build the long table of latent values:
#   1. put the per-draw plot effects next to the drawn code and severity,
#   2. melt to one row per (draw, plot),
#   3. look up the cultivar of each plot and that cultivar's effect,
#   4. THETA = cultivar effect + plot-location effect,
#   5. cast PLOC_CODE to int.
thetas = (
    pd.concat([plot_effect, betas], axis=1)
    .melt(
        id_vars=['RATING_EVENT_CODE', 'BETA'],
        var_name='PLOC_CODE',
        value_name='PLOC_EFF',
    )
    .merge(layout[['ENTRY_CODE', 'PLOC_CODE']], on='PLOC_CODE')
    .merge(entry_effect, on='ENTRY_CODE')
    .assign(THETA=lambda frame: frame.ENTRY_EFF + frame.PLOC_EFF)
    .astype({'PLOC_CODE': int})
)

In [31]:
# category threshold
# M-1 evenly spaced thresholds on [-2, 2].
taus = np.linspace(-2, 2, M - 1)

# simulate rating scores
# One score per row from its THETA, its BETA and the shared thresholds.
# The helper lives in the `utils` module imported at the top of the notebook
# (the name `util` is not defined anywhere and raised NameError).
thetas['QUALITY'] = thetas.apply(
    lambda row: utils.simulate_rating_scores(row['THETA'], row['BETA'], taus),
    axis=1,
)
thetas.head()

Unnamed: 0,RATING_EVENT_CODE,BETA,PLOC_CODE,PLOC_EFF,ENTRY_CODE,ENTRY_EFF,THETA,QUALITY
0,2,-0.4,1,-0.948406,9,-1.181069,-2.129475,2
1,1,-0.8,1,-0.780259,9,-1.181069,-1.961328,3
2,3,0.0,1,-0.482946,9,-1.181069,-1.664014,2
3,5,0.8,1,-0.932907,9,-1.181069,-2.113975,1
4,5,0.8,1,-1.072637,9,-1.181069,-2.253705,1


In [43]:
# Display the category thresholds used to simulate the ratings.
taus

array([-2.        , -1.42857143, -0.85714286, -0.28571429,  0.28571429,
        0.85714286,  1.42857143,  2.        ])

In [32]:
# Histogram of the simulated QUALITY scores.
px.histogram(x=thetas['QUALITY'])

In [56]:
# Write the simulated data set to CSV, leaving out the row index.
thetas.to_csv(path_or_buf='./model_data/data_for_param_recover.csv', index=False)

## Estimate parameters via the model

In [34]:
# get model data
# Build the `data` input later handed to model.sample() from the simulated
# ratings and the plot distance matrix (the exact structure is defined by
# utils.get_model_data).
model_data = utils.get_model_data(thetas, DIST=dist_matrix)

In [35]:
# load stan model
# Compile the Stan program (or reuse an up-to-date executable) with threading
# enabled. If that fails, rebuild the CmdStan installation once and retry, so
# that `model` is always bound when this cell finishes without error.
# Catching Exception instead of a bare `except` lets KeyboardInterrupt and
# SystemExit propagate.
stan_file = './src/no_consistent_rater_model_dist_matrix.stan'
cpp_options = {'STAN_THREADS': 'true'}
try:
    model = CmdStanModel(stan_file=stan_file, cpp_options=cpp_options)
except Exception:
    cmdstanpy.rebuild_cmdstan()
    model = CmdStanModel(stan_file=stan_file, cpp_options=cpp_options)

INFO:cmdstanpy:found newer exe file, not recompiling


In [36]:
### fit the model
# 4 chains run in parallel with 500 warmup + 1000 sampling iterations each;
# warmup draws are kept and the output files go to ./model_output/param_recover.
sampler_settings = dict(
    chains=4,
    parallel_chains=4,
    iter_warmup=500,
    iter_sampling=1000,
    save_warmup=True,
    adapt_delta=0.99,
    max_treedepth=15,
    seed=123,
    refresh=1,
    output_dir='./model_output/param_recover',
    time_fmt='%Y%m%d',
)
fit = model.sample(data=model_data, **sampler_settings)

INFO:cmdstanpy:CmdStan start processing


chain 1 |          | 00:00 Status

chain 2 |          | 00:00 Status

chain 3 |          | 00:00 Status

chain 4 |          | 00:00 Status

                                                                                                                                                                                                                                                                                                                                

INFO:cmdstanpy:CmdStan done processing.





In [37]:
# Run the sampler diagnostics on the fit; the returned value is the cell output.
fit.diagnose()



In [51]:
# Summary table of the fit (default percentiles).
sum_data = fit.summary()
# max of Rhat value
sum_data.loc[:, 'R_hat'].max()

1.0

In [59]:
# Show the first rows of the summary table.
sum_data.head()

Unnamed: 0_level_0,Mean,MCSE,StdDev,5%,50%,95%,N_Eff,N_Eff/s,R_hat
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
lp__,-3000.0,0.21,6.4,-3000.0,-3000.0,-3000.0,900.0,0.61,1.0
beta_free[1],-0.79,0.00058,0.04,-0.85,-0.79,-0.72,4887.0,3.3,1.0
beta_free[2],-0.32,0.00054,0.042,-0.39,-0.32,-0.25,5961.0,4.1,1.0
beta_free[3],0.00045,0.00049,0.038,-0.064,-0.00023,0.064,6216.0,4.2,1.0
beta_free[4],0.34,0.00056,0.039,0.28,0.34,0.41,4797.0,3.3,1.0


In [52]:
# Compare posterior estimates with the values used to generate the data.
tracked = ('beta', 'tau', 'plot', 'sigma')

# Summary with a 95% interval; keep rows whose name contains one of the
# tracked substrings, skipping anything with 'free' in its name.
param_est = fit.summary([2.5, 97.5])
masks = [
    name
    for name in param_est.index
    if 'free' not in name and any(key in name for key in tracked)
]
param_est = param_est.loc[masks, ['Mean', '2.5%', '97.5%']]

# Generating values, stacked positionally.
# NOTE(review): this assumes the selected summary rows come in the order
# sigma, sigma_e, beta, tau, plot-location effects -- confirm against the
# Stan model.
param_gen = np.concatenate(([sigma, sigma_e], beta, taus, layout.PLOC_EFF.to_numpy()))

# Row-wise "estimate - truth", then the one-sided error-bar lengths around the mean.
discrepency = param_est.sub(param_gen, axis=0)
discrepency = discrepency.assign(
    error_minus=lambda frame: frame['Mean'] - frame['2.5%'],
    error_plus=lambda frame: frame['97.5%'] - frame['Mean'],
)

In [57]:
# Save the recovery errors (the index holds the parameter names). The target
# directory is created first so the write does not fail on a fresh checkout.
os.makedirs('./model_output/dataframes', exist_ok=True)
discrepency.to_csv('./model_output/dataframes/parameter_discrepency.csv')

In [54]:
# Recovery plot: one point per parameter at (estimate - generating value) with
# the shifted 95% interval as horizontal error bars; the dashed line at 0
# marks perfect recovery.
axis_labels = {'name': 'Parameters', 'Mean': 'Estimated Values - Original Values'}
fig = px.scatter(
    discrepency,
    x='Mean',
    y=discrepency.index,
    error_x='error_plus',
    error_x_minus='error_minus',
    labels=axis_labels,
    height=1000,
)
fig.add_vline(x=0, line_width=3, line_dash="dash")
fig.show()