In [1]:
# Load packages
import os
import glob
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import plotly.express as px
import plotly.io as pio
pio.templates.default = "simple_white"

import sys
sys.path.append('..')
from src import util
from cmdstanpy import CmdStanModel
import cmdstanpy
# Point cmdstanpy at the CmdStan installation through the CMDSTAN environment variable.
# The path below is the author's local miniforge environment and is used only as a
# fallback: if CMDSTAN is already set (e.g. exported before starting Jupyter), that
# value is kept, so the notebook is not tied to a single machine.
# TODO: move environment management from miniforge to poetry.
os.environ.setdefault('CMDSTAN', '/Users/henryqu/miniforge3/envs/cmdstan/bin/cmdstan')

In [2]:
# Read the NJ2 quality ratings table and preview its first rows.
quality_csv = "../model_data/quality_nj2.csv"
ntep_qual = pd.read_csv(quality_csv)
ntep_qual.head(5)

Unnamed: 0,PLT_ID,TRAIT,DATE,QUALITY,ENTRY_CODE,ENTRY_NAME,COMP,RATER,ROW,COL,PLOC_CODE,YEAR,MONTH,RATING_EVENT,RATING_EVENT_CODE,MONTH_CODE,YEAR_CODE,TEST_LOC
0,241,Turf Quality,4/18/18,4,37,NAI-14-132,Columbia River Seed,RMD,17,1,239,2018,Apr,RMD-2018-04-18,8,1,1,"Adelphia, NJ"
1,241,Turf Quality,5/10/18,4,37,NAI-14-132,Columbia River Seed,RON,17,1,239,2018,May,RON-2018-05-10,13,6,1,"Adelphia, NJ"
2,241,Turf Quality,6/7/18,5,37,NAI-14-132,Columbia River Seed,RON,17,1,239,2018,Jun,RON-2018-06-07,14,4,1,"Adelphia, NJ"
3,241,Turf Quality,7/17/18,5,37,NAI-14-132,Columbia River Seed,RON,17,1,239,2018,Jul,RON-2018-07-17,15,3,1,"Adelphia, NJ"
4,241,Turf Quality,8/15/18,6,37,NAI-14-132,Columbia River Seed,RMD,17,1,239,2018,Aug,RMD-2018-08-15,9,2,1,"Adelphia, NJ"


In [3]:
# Fit a linear mixed model of QUALITY on ENTRY_NAME, with observations grouped by DATE.
rating_dates = ntep_qual["DATE"]
md = smf.mixedlm(formula="QUALITY ~ ENTRY_NAME", data=ntep_qual, groups=rating_dates)
mdf = md.fit()

In [4]:
# Extract entry effects from LMM
# The second table of the fitted model's summary holds one row per model term
# (estimate, std. error, z, p-value and the 95% interval bounds).
summary_data = pd.DataFrame(mdf.summary().tables[1])
# Keep only the ENTRY_NAME terms; reset_index() moves the term label into a column.
entry_lmm = summary_data.loc[[ix for ix in summary_data.index if ix.startswith("ENTRY")]].reset_index()
entry_lmm.columns=['ENTRY_NAME', 'EFF_LMM', 'STD_LMM', 'Z_LMM', 'P_LMM', '2.5LMM', '97.5LMM']
# Term labels have the form "ENTRY_NAME[T.<entry>]". Split once on the "[T." prefix
# (not on every "."), so an entry name that itself contains a dot is kept whole,
# then drop the closing "]".
entry_lmm = entry_lmm.assign(ENTRY_NAME=entry_lmm.ENTRY_NAME.apply(lambda x: x.split('[T.', 1)[-1][:-1]))
entry_lmm.head()

Unnamed: 0,ENTRY_NAME,EFF_LMM,STD_LMM,Z_LMM,P_LMM,2.5LMM,97.5LMM
0,A10-280,0.88,0.174,5.067,0.0,0.539,1.22
1,A11-26,2.019,0.174,11.627,0.0,1.678,2.359
2,A11-38,-1.546,0.174,-8.907,0.0,-1.887,-1.206
3,A11-40,0.269,0.174,1.547,0.122,-0.072,0.609
4,A12-34,-0.037,0.174,-0.213,0.831,-0.377,0.303


In [5]:
# Build the plot-to-plot distance matrix from the layout recorded on the first rating date.
first_date = ntep_qual.DATE.unique()[0]
plot_layout = ntep_qual.loc[ntep_qual.DATE == first_date, ['ROW', 'COL', 'PLOC_CODE']]
# NOTE(review): row_len / col_len are presumably the plot dimensions — confirm in src/util.py.
dist_matrix = util.calc_dist_matrix(
    plot_layout,
    row_len=4,
    col_len=6,
    index_col='PLOC_CODE',
)
dist_matrix.shape

(267, 267)

In [6]:
# Heat-map of the distance matrix, drawn with the origin at the lower-left corner.
fig = px.imshow(
    dist_matrix,
    labels=dict(color='Distance'),
    origin='lower',
)
fig.show()

In [7]:
# get model data
# Combine the ratings table with the distance matrix (passed as DIST) into the
# object that is later handed to the Stan sampler as `data`.
# NOTE(review): the exact structure is defined in src/util.py — presumably a dict
# of Stan data fields; confirm there.
model_data = util.get_model_data(ntep_qual, DIST=dist_matrix)

In [19]:
# load stan model
# NOTE(review): this path is relative to the current directory, while data, output
# and the `src` package are referenced relative to '..' — confirm the location.
stan_file = './src/no_consistent_rater_model_dist_matrix.stan'
# Build with threading support.
cpp_options = {'STAN_THREADS': 'true'}
try:
    model = CmdStanModel(stan_file=stan_file, cpp_options=cpp_options)
except Exception:
    # The first attempt failed: rebuild the CmdStan installation and try once more.
    # Retrying ensures `model` is defined for the following cells, and any error that
    # persists after the rebuild is raised here rather than being silently swallowed.
    cmdstanpy.rebuild_cmdstan()
    model = CmdStanModel(stan_file=stan_file, cpp_options=cpp_options)

INFO:cmdstanpy:found newer exe file, not recompiling


In [20]:
# fit the model
# All sampler options are gathered in one mapping so they can be read at a glance.
sampler_settings = dict(
    chains=4,
    parallel_chains=4,
    adapt_delta=0.99,
    max_treedepth=15,
    refresh=1,
    iter_warmup=500,
    iter_sampling=1000,
    save_warmup=True,
    output_dir='../model_output/no_year_month/nj2',
    seed=123,
    time_fmt='%Y%m%d',
)
fit = model.sample(data=model_data, **sampler_settings)

INFO:cmdstanpy:created output directory: /Users/henryqu/Documents/GitHub/ntep-rating/model_output/no_year_month/NJ2
INFO:cmdstanpy:CmdStan start processing


chain 1 |          | 00:00 Status

chain 2 |          | 00:00 Status

chain 3 |          | 00:00 Status

chain 4 |          | 00:00 Status

                                                                                                                                                                                                                                                                                                                                

INFO:cmdstanpy:CmdStan done processing.





In [21]:
# Run cmdstanpy's diagnose() check on the sampler output before using the estimates.
fit.diagnose()



In [148]:
# Summarise the fitted parameters, reporting the 2.5th and 97.5th percentiles.
param_est = fit.summary(percentiles=[2.5, 97.5])

In [None]:
# NOTE(review): this unexecuted cell repeats the previous cell's statement verbatim;
# one of the two can be removed.
param_est = fit.summary([2.5, 97.5])