In [None]:
# Core data-handling and plotting libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# gptools helper used later in the notebook to compile the Stan model.
from gptools.stan import compile_model
import cmdstanpy
# Ask cmdstanpy to install CmdStan; note this call runs every time the cell is executed.
cmdstanpy.install_cmdstan()
import nest_asyncio
# Apply the nest_asyncio patch -- presumably so nested asyncio event loops work
# inside the notebook kernel; TODO confirm this is still required.
nest_asyncio.apply()

In [None]:
import random
from sklearn.preprocessing import LabelEncoder
# Load the raw turfgrass quality ratings.
df = pd.read_csv('../model_data/quality_nj2.csv')

# One LabelEncoder per categorical column. The fitted encoders are kept under
# their own names so the integer codes can be mapped back to labels later.
encoder1 = LabelEncoder()
encoder2 = LabelEncoder()
encoder3 = LabelEncoder()
encoder4 = LabelEncoder()
for code_col, label_col, enc in (
    ('RATING_EVENT_CODE', 'RATING_EVENT', encoder1),
    ('ENTRY_NAME_CODE', 'ENTRY_NAME', encoder2),
    ('PLT_ID_CODE', 'PLT_ID', encoder3),
    ('RATER_CODE', 'RATER', encoder4),
):
    df[code_col] = enc.fit_transform(df[label_col])

# Mean ROW/COL per encoded plot id (one row per plot, indexed by PLT_ID_CODE).
plt_coords = df.groupby('PLT_ID_CODE')[['ROW', 'COL']].mean()

# Express each rating date as a fraction of its calendar year.
rating_dates = pd.to_datetime(df['DATE'])
year = rating_dates.dt.year
year_start = pd.to_datetime(year.astype(str), format='%Y')
is_leap = year_start.dt.is_leap_year
num_days_in_year = is_leap * 366 + (~is_leap) * 365  # 366 in leap years, else 365
df['DAY_OF_YEAR'] = rating_dates.dt.dayofyear
df['TIME_OF_YEAR'] = df['DAY_OF_YEAR'] / num_days_in_year

# 1-based running index of each row within its ENTRY_NAME group, in file order.
df['ENTRY_CUMCOUNT'] = df.groupby('ENTRY_NAME').cumcount() + 1
df.head()

In [None]:
# ---- Tunable settings -------------------------------------------------------
padding = 5              # extra rows/cols added around the field for the Fourier GP plot effect
num_rows = 18            # rows in the turfgrass field layout
num_cols = 15            # columns in the turfgrass field layout
pred_N = 100             # number of points one year is split into for time-effect predictions
num_basis_functions = 6  # number of basis functions for the time effect

# ---- Observation-level inputs -----------------------------------------------
# All *_id arrays are the 0-based label-encoder codes shifted by +1.
observation_data = {
    "N": len(df["QUALITY"]),                      # number of responses
    "num_raters": len(df['RATER'].unique()),      # number of distinct raters
    "num_entries": len(df['ENTRY_NAME'].unique()),  # number of distinct entries (turfgrass types)
    "num_plots": len(df['PLT_ID'].unique()),      # number of distinct plots
    "num_categories": 9,                          # number of rating categories
    # number of rows with a non-null PLT_ID, per rater code
    "num_events_by_rater": df.groupby('RATER_CODE')['PLT_ID'].count(),
    "rater_id": df["RATER_CODE"].values + 1,      # rater of y[n]
    "entry_id": df["ENTRY_NAME_CODE"].values + 1,  # entry of y[n]
    "plot_id": df["PLT_ID_CODE"].values + 1,      # plot of y[n]
    "y": df["QUALITY"].values,                    # the rating value being modelled
}

# ---- Fourier (scalable) Gaussian-process inputs for the plot effect ---------
plot_gp_data = {
    "num_rows": num_rows,
    "num_cols": num_cols,
    "num_rows_padded": num_rows + padding,
    "num_cols_padded": num_cols + padding,
    "plot_row": plt_coords["ROW"].astype(int),
    "plot_col": plt_coords["COL"].astype(int),
}

# ---- Time GP inputs ---------------------------------------------------------
time_gp_data = {
    # Fractional time of year plus the YEAR column.
    # NOTE(review): YEAR is not created in the visible cells, so this assumes
    # the CSV already provides it -- confirm.
    "time": df['TIME_OF_YEAR'] + df['YEAR'],
    "c_f": 1.5,  # NOTE(review): meaning not shown here; confirm against the Stan model
    "M_f": num_basis_functions,  # number of Hilbert basis functions
    # largest per-entry count of rows with a non-null PLT_ID
    "num_ratings_per_entry": df.groupby('ENTRY_NAME')['PLT_ID'].count().max(),
    "entry_cumcount": df["ENTRY_CUMCOUNT"],
}

# ---- Prediction grid --------------------------------------------------------
prediction_data = {
    "pred_N": pred_N,
    # pred_N evenly spaced points over (0, 1]; the leading 0 is dropped
    "pred_time": np.linspace(0, 1, pred_N + 1)[1:],
}

# Merge the sections; key order matches the order in which they are listed.
stan_data_time = {
    **observation_data,
    **plot_gp_data,
    **time_gp_data,
    **prediction_data,
}

In [None]:
# Compile the Stan program (recompiling even if a cached executable exists).
model = compile_model(
    stan_file="fourier_model_time_effect_distinct_entries_v8_sharedparams.stan",
    force_compile=True,
)
# Sample from the compiled model using the data dictionary assembled above.
fit = model.sample(data=stan_data_time)

In [None]:
import pickle
from datetime import date, datetime

# Persist the compiled model and the fit together in one timestamped pickle.
# The timestamp uses only digits and an underscore: the previous
# "%d/%m/%Y %H:%M:%S" pattern put "/" in the file name (interpreted as
# non-existent directories) and ":" (not allowed on Windows). Year-first
# ordering also makes the files sort chronologically.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
model_path = "model_" + timestamp + ".pkl"

with open(model_path, "wb") as f:
    # protocol=-1 selects the highest pickle protocol available.
    pickle.dump({'model': model, 'fit': fit}, f, protocol=-1)