In [223]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
pio.templates.default = "simple_white"
pio.renderers.default = 'iframe'
from datetime import datetime, timedelta

import sys
sys.path.append('..')
from src import util

def is_leapday(date):
    """Tell whether ``date`` falls on 29 February.

    ``date`` only needs ``month`` and ``day`` attributes. The two checks are
    combined with ``&`` rather than ``and`` so the result stays element-wise
    when those attributes are array-valued.
    """
    in_february = date.month == 2
    on_the_29th = date.day == 29
    return in_february & on_the_29th
def get_increment(date):
    """Return how far the adjusted day counter advances on ``date``.

    The leap day itself and the day right after it each count as 0.5;
    every other day counts as 1.
    """
    day_before = date - timedelta(days=1)
    if is_leapday(date) or is_leapday(day_before):
        return 0.5
    return 1
def calc_date_distance(target: datetime, anchor:datetime):
    """Adjusted distance in days from ``anchor`` to ``target``.

    Every calendar day after ``anchor`` up to and including ``target``
    contributes its ``get_increment`` value, so the days around a leap day
    count as half steps. An empty range yields 0.
    """
    distance = 0
    # inclusive='right' leaves the anchor itself out and keeps the target in
    for day in pd.date_range(anchor, target, inclusive='right'):
        distance += get_increment(day)
    return distance
def calc_adjusted_date_dist(dates: pd.Series, name='DATE_CODE'):
    """Build a per-day lookup table of leap-adjusted date codes.

    The table covers every calendar day from 1 January of the earliest year
    in ``dates`` to 31 December of the latest year. The code is a running
    total that advances by 1 per day, except on a leap day and on the day
    after it, which advance by 0.5 each. 1 January of the first year
    therefore has code 1, i.e. the code is the adjusted distance from the
    last day of the preceding year, and every year spans exactly 365 units.

    Parameters
    ----------
    dates : pd.Series
        Datetime values; only the years of the minimum and maximum are used.
    name : str, default 'DATE_CODE'
        Name of the column that receives the adjusted code.

    Returns
    -------
    pd.DataFrame
        Columns ``'DATE'`` (one row per calendar day) and ``name``
        (always float64, whether or not a leap year is in range).

    Raises
    ------
    ValueError
        If ``dates`` holds no valid (non-missing) date.
    """
    first, last = dates.min(), dates.max()
    if pd.isna(first) or pd.isna(last):
        raise ValueError("dates must contain at least one valid date")

    start_date = datetime(first.year, 1, 1)
    end_date = datetime(last.year, 12, 31)
    calendar = pd.date_range(start_date, end_date)

    # A day is a half step when it is 29 February or when the previous day was.
    previous = calendar - timedelta(days=1)
    on_leapday = (calendar.month == 2) & (calendar.day == 29)
    after_leapday = (previous.month == 2) & (previous.day == 29)
    increments = np.where(on_leapday | after_leapday, 0.5, 1.0)

    dates_data = pd.DataFrame({'DATE': calendar})
    dates_data[name] = np.cumsum(increments)
    return dates_data

In [192]:
# Load the turf-quality ratings and parse DATE into datetimes in one chained step.
data = (
    pd.read_csv('../model_data/quality_nj2.csv')
    .assign(DATE=lambda frame: pd.to_datetime(frame['DATE']))
)
data.head()

Unnamed: 0,PLT_ID,TRAIT,DATE,QUALITY,ENTRY_CODE,ENTRY_NAME,COMP,RATER,ROW,COL,PLOC_CODE,YEAR,MONTH,RATING_EVENT,RATING_EVENT_CODE,MONTH_CODE,YEAR_CODE,TEST_LOC
0,241,Turf Quality,2018-04-18,4,37,NAI-14-132,Columbia River Seed,RMD,17,1,239,2018,Apr,RMD-2018-04-18,8,1,1,"Adelphia, NJ"
1,241,Turf Quality,2018-05-10,4,37,NAI-14-132,Columbia River Seed,RON,17,1,239,2018,May,RON-2018-05-10,13,6,1,"Adelphia, NJ"
2,241,Turf Quality,2018-06-07,5,37,NAI-14-132,Columbia River Seed,RON,17,1,239,2018,Jun,RON-2018-06-07,14,4,1,"Adelphia, NJ"
3,241,Turf Quality,2018-07-17,5,37,NAI-14-132,Columbia River Seed,RON,17,1,239,2018,Jul,RON-2018-07-17,15,3,1,"Adelphia, NJ"
4,241,Turf Quality,2018-08-15,6,37,NAI-14-132,Columbia River Seed,RMD,17,1,239,2018,Aug,RMD-2018-08-15,9,2,1,"Adelphia, NJ"


In [273]:
# Draw the example entries from a seeded generator so the sample (and the plot
# built from it) is reproducible after a kernel restart; replace=False keeps
# the sampled entries distinct.
entry_names = data.ENTRY_NAME.unique()
entry_sample = np.random.default_rng(seed=0).choice(
    entry_names, size=min(5, len(entry_names)), replace=False
)
entry_sample

array(['A13-1', 'DLFPS-340/3364', 'NAI-15-80', 'A16-2', 'Babe'],
      dtype=object)

In [274]:
# Quality over time for the sampled entries, with a LOWESS trend per entry.
sampled_ratings = data[data.ENTRY_NAME.isin(entry_sample)].sort_values(by='DATE')
px.scatter(
    sampled_ratings,
    x='DATE',
    y='QUALITY',
    color='ENTRY_NAME',
    trendline='lowess',
    trendline_options=dict(frac=0.1),
)

In [194]:
# Date distances are calculated relative to the last day of the year preceding the year in which the trial was established.

In [197]:
# Per-day lookup of leap-adjusted date codes covering every year that has ratings.
dates_data = calc_adjusted_date_dist(dates=data['DATE'])
dates_data.head()

Unnamed: 0,DATE,DATE_CODE
0,2018-01-01,1.0
1,2018-01-02,2.0
2,2018-01-03,3.0
3,2018-01-04,4.0
4,2018-01-05,5.0


In [198]:
# Attach the adjusted date code to every rating.
# Dropping any code column left over from a previous run keeps this cell
# idempotent (a re-run would otherwise produce DATE_CODE_x / DATE_CODE_y).
# validate= fails loudly if the lookup ever has duplicate dates, which would
# silently duplicate rating rows.
code_columns = dates_data.columns.difference(['DATE'])
data = (
    data
    .drop(columns=code_columns, errors='ignore')
    .merge(dates_data, on='DATE', validate='many_to_one')
)
data.head()

Unnamed: 0,PLT_ID,TRAIT,DATE,QUALITY,ENTRY_CODE,ENTRY_NAME,COMP,RATER,ROW,COL,PLOC_CODE,YEAR,MONTH,RATING_EVENT,RATING_EVENT_CODE,MONTH_CODE,YEAR_CODE,TEST_LOC,DATE_CODE
0,241,Turf Quality,2018-04-18,4,37,NAI-14-132,Columbia River Seed,RMD,17,1,239,2018,Apr,RMD-2018-04-18,8,1,1,"Adelphia, NJ",108.0
1,240,Turf Quality,2018-04-18,5,25,BAR PP 7236V,Barenbrug Research,RMD,16,1,238,2018,Apr,RMD-2018-04-18,8,1,1,"Adelphia, NJ",108.0
2,211,Turf Quality,2018-04-18,6,59,DLFPS-340/3455,DLF Pickseed USA,RMD,15,1,209,2018,Apr,RMD-2018-04-18,8,1,1,"Adelphia, NJ",108.0
3,210,Turf Quality,2018-04-18,3,75,A15-6,Peak Plant Genetics,RMD,14,1,208,2018,Apr,RMD-2018-04-18,8,1,1,"Adelphia, NJ",108.0
4,181,Turf Quality,2018-04-18,3,31,NAI-15-80,SiteOne Landscape Supply,RMD,13,1,179,2018,Apr,RMD-2018-04-18,8,1,1,"Adelphia, NJ",108.0


In [258]:
# Distribution of quality scores per rating date, split by rater.
px.violin(
    x=data['DATE'],
    y=data['QUALITY'],
    color=data['RATER'],
)

The prior knowledge of annual turfgrass growth is encoded with a periodic covariance function (a squared exponential kernel applied to a periodic mapping of time, with a period of 365):
$$K_d(d, d') = \sigma_d^2 \exp\left(-\frac{2\sin^2\left(\pi(d-d')/365\right)}{l_{d}^2}\right)
$$
$d = d(t)$ is a modified time in which the leap day and the day after it each advance $d$ by 0.5 instead of 1, so that in $d$ the length of a year is always 365, whether it is a common year or a leap year.

## TODO: add local variability by multiplying the periodic kernel by a squared exponential term
$$K_d(d, d') = \sigma_d^2 \exp\left(-\frac{2\sin^2\left(\pi(d-d')/365\right)}{l_{d,1}^2}\right)\exp\left(-\frac{|d-d'|^2}{l_{d,2}^2}\right)$$

In [240]:
# Encode each rater as a 1-based integer index (categorical codes start at 0).
rater_codes = pd.Categorical(data['RATER']).codes + 1
data = data.assign(RATER_CODE=rater_codes)

In [253]:
# Show the ratings frame (pandas truncates the display to the first and last
# rows) to check that the DATE_CODE and RATER_CODE columns are present.
data

Unnamed: 0,PLT_ID,TRAIT,DATE,QUALITY,ENTRY_CODE,ENTRY_NAME,COMP,RATER,ROW,COL,PLOC_CODE,YEAR,MONTH,RATING_EVENT,RATING_EVENT_CODE,MONTH_CODE,YEAR_CODE,TEST_LOC,DATE_CODE,RATER_CODE
0,241,Turf Quality,2018-04-18,4,37,NAI-14-132,Columbia River Seed,RMD,17,1,239,2018,Apr,RMD-2018-04-18,8,1,1,"Adelphia, NJ",108.0,4
1,240,Turf Quality,2018-04-18,5,25,BAR PP 7236V,Barenbrug Research,RMD,16,1,238,2018,Apr,RMD-2018-04-18,8,1,1,"Adelphia, NJ",108.0,4
2,211,Turf Quality,2018-04-18,6,59,DLFPS-340/3455,DLF Pickseed USA,RMD,15,1,209,2018,Apr,RMD-2018-04-18,8,1,1,"Adelphia, NJ",108.0,4
3,210,Turf Quality,2018-04-18,3,75,A15-6,Peak Plant Genetics,RMD,14,1,208,2018,Apr,RMD-2018-04-18,8,1,1,"Adelphia, NJ",108.0,4
4,181,Turf Quality,2018-04-18,3,31,NAI-15-80,SiteOne Landscape Supply,RMD,13,1,179,2018,Apr,RMD-2018-04-18,8,1,1,"Adelphia, NJ",108.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9607,75,Turf Quality,2021-11-18,6,75,A15-6,Peak Plant Genetics,WAM,5,15,75,2021,Nov,WAM-2021-11-18,36,7,4,"Adelphia, NJ",1417.0,7
9608,46,Turf Quality,2021-11-18,5,46,Bombay (GO-22B23),Grassland Oregon,WAM,4,15,46,2021,Nov,WAM-2021-11-18,36,7,4,"Adelphia, NJ",1417.0,7
9609,45,Turf Quality,2021-11-18,5,45,Skye,Standard,WAM,3,15,45,2021,Nov,WAM-2021-11-18,36,7,4,"Adelphia, NJ",1417.0,7
9610,16,Turf Quality,2021-11-18,7,16,J-1138,Jacklin Seed by Simplot,WAM,2,15,16,2021,Nov,WAM-2021-11-18,36,7,4,"Adelphia, NJ",1417.0,7


In [252]:
# Prediction grid: every Monday inside the span covered by the date-code lookup.
first_day, last_day = dates_data['DATE'].min(), dates_data['DATE'].max()
pred_dates = pd.date_range(start=first_day, end=last_day, freq='W-MON')
on_pred_date = dates_data['DATE'].isin(pred_dates)
pred_data = dates_data[on_pred_date]
pred_data

Unnamed: 0,DATE,DATE_CODE
0,2018-01-01,1.0
7,2018-01-08,8.0
14,2018-01-15,15.0
21,2018-01-22,22.0
28,2018-01-29,29.0
...,...,...
1428,2021-11-29,1428.0
1435,2021-12-06,1435.0
1442,2021-12-13,1442.0
1449,2021-12-20,1449.0


In [244]:
# Start from the shared model inputs, then add the rater index and the
# adjusted-time inputs in a single update (same keys, order and values).
model_data = util.get_model_data(data)
rater_index = np.asarray(data.RATER_CODE)
date_codes = np.asarray(data.DATE_CODE)
model_data.update({
    'ii': rater_index,
    'I': int(max(rater_index)),
    'tt': date_codes,
    'T': int(max(date_codes)),
    'period': 365,
})
model_data

{'y': array([3, 4, 5, ..., 4, 6, 4]),
 'ii': array([4, 4, 4, ..., 7, 7, 7], dtype=int8),
 'jj': array([37, 25, 59, ..., 45, 16, 15]),
 'pp': array([239, 238, 209, ...,  45,  16,  15]),
 'N': 9612,
 'M': 8,
 'I': 7,
 'J': 89,
 'P': 267,
 'tt': array([ 108.,  108.,  108., ..., 1417., 1417., 1417.]),
 'T': 1417,
 'period': 365}

In [215]:
from cmdstanpy import CmdStanModel