## Functions and Data

In [1]:
import os
import datetime
import copy


import numpy as np
from numpy.random import normal as rnorm, multinomial as rmultinomial
import scipy.stats
from scipy.special import logsumexp, loggamma
from sklearn.mixture import GaussianMixture
from sklearn.metrics import r2_score, mean_squared_error, roc_auc_score
from sklearn.linear_model import LinearRegression, RidgeCV, Ridge, LogisticRegression, LogisticRegressionCV
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVR
from sklearn.model_selection import KFold

import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm


plt.style.use(['seaborn-talk'])

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)


In [2]:
import warnings
from sklearn.exceptions import DataConversionWarning, ConvergenceWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)
warnings.filterwarnings(action='ignore', category=UserWarning)
warnings.simplefilter(action='ignore', category=sm.tools.sm_exceptions.DomainWarning)

In [3]:
data_W = pd.read_excel(r"data/Hourly Weather and No Outlier Wachapreague Data.xlsx", parse_dates=[r"Date"])
data_W = data_W.drop(
    columns=["Wind Direction (degrees)", "Water Level Anomalies (m)"]
    ).rename(
    columns = {"Date": "time_min", 
                "Wind Speed (m/s)": "wind",
               "Air Pressure (mb)": "airpressure",
               "Precipitation (mm/hr)": "precipitation", 
               "Temperature ©": "temperature",
               "Salinity (ppt)": "salinity",
               "DO (mg/L)": "ODO",
               "Water Level (m)": "waterlevel",
               "Log10(Chl+1) (log10(ug/L))": "log10_chlorophyll",
              }
    ).set_index("time_min")

data_W["chlorophyll"] = data_W["log10_chlorophyll"].map(lambda x: np.power(10, x))
data_W["date"] = data_W.index.date
#data_W = data_W.dropna()

# data_W = pd.read_excel(r"data/Corrected W All.xlsx", sheet_name = "Sheet1", parse_dates=[r"Combine"],)
# data_W = data_W.drop(
#     columns=["MM/DD/YY", "HH:mm:SS", "pH (mv)", "ODO (%sat)", "BGA-PE (ug/L)", "Battery (volts)", "Sonde SN", "Unnamed: 15"]
#     ).rename(
#     columns = {"Combine": "time_min",
#                "Temp ('C)": "temperature",
#                "SpCond (ms/cm)": "conductivity",
#                "Salinity (ppt)": "salinity",
#                "ODO (mg/L)": "ODO",
#                "Turb (NTU)": "turbidity",
#                "Chl (ug/L)": "chlorophyll",
#               }
#     ).set_index("time_min")
# data_W["date"] = data_W.index.date
# data_W["log10_chlorophyll"] = data_W["chlorophyll"].map(np.log10)
# data_W = data_W.dropna()


In [4]:
data_W

Unnamed: 0_level_0,wind,airpressure,precipitation,temperature,salinity,ODO,log10_chlorophyll,waterlevel,chlorophyll,date
time_min,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2016-03-25 20:00:00,1.870,1012.210000,0.76,16.70975,30.9550,8.0425,0.360495,0.938,2.293480,2016-03-25
2016-03-25 21:00:00,2.500,1013.730000,0.53,16.43975,31.2575,7.8200,0.319070,1.127,2.084826,2016-03-25
2016-03-25 22:00:00,3.690,1014.980000,0.20,15.83275,31.4425,7.8650,0.327645,1.268,2.126401,2016-03-25
2016-03-25 23:00:00,4.180,1015.490000,0.03,15.43950,31.4775,7.8525,0.330899,1.324,2.142390,2016-03-25
2016-03-26 00:00:00,3.150,1016.750000,0.00,15.45900,31.3925,7.7400,0.325592,1.238,2.116372,2016-03-26
...,...,...,...,...,...,...,...,...,...,...
2022-12-31 18:00:00,0.625,1011.325000,0.02,7.93175,30.2575,10.6000,0.358752,0.8,2.284292,2022-12-31
2022-12-31 19:00:00,1.800,1011.310000,,8.10550,30.1100,10.5450,0.359835,,2.289997,2022-12-31
2022-12-31 20:00:00,2.120,1010.420000,,8.65100,28.9200,10.2250,0.429598,,2.689042,2022-12-31
2022-12-31 21:00:00,2.850,1010.810000,,8.67150,28.6850,10.1925,0.454817,,2.849817,2022-12-31


In [5]:

data_WW = pd.read_excel(r"data/VIMS WQ Data.xlsx", sheet_name = "WW", skiprows = 7, na_values = {"pH": [0.0]}, parse_dates=[r"Date/Time Combined", r"MM/DD/YY"], )
data_WW_2 = pd.read_excel(r"data/2022-Willis_Wharf ALL Raw (1).xlsx", skiprows = 7, na_values = {"pH": [0.0]}, parse_dates=[r"Date/Time Combined", r"MM/DD/YY"], )

data_WW = data_WW.drop(
    columns=["Unnamed: 17", "Unnamed: 18", "MM/DD/YY.1", "Flagged or Deleted Data/Notes", "Log10 chl", 
             "pH (mv)", "ODO (%sat)", "BGA-PE (ug/L)", "Battery (volts)", "Sonde SN", "HH:mm:SS", "TSS (mg/L)", "Day"],
    ).rename(
    columns = {"MM/DD/YY": "date", 
               "Date/Time Combined": "time_min",
               "Temp ('C)": "temperature",
               "SpCond (ms/cm)": "conductivity",
               "Salinity (ppt)": "salinity",
               "ODO (mg/L)": "ODO",
               "Turb (NTU)": "turbidity",
               "Chl (ug/L)": "chlorophyll",
              }
    ).set_index("time_min")

data_WW_2 = data_WW_2.drop(
    columns=["Unnamed: 16", "Unnamed: 17", "Unnamed: 18", "Unnamed: 19", "Unnamed: 20", 
             "pH (mv)", "ODO (%sat)", "BGA-PE (ug/L)", "Battery (volts)", "Sonde SN", "HH:mm:SS", "TSS (mg/L)"],
    ).rename(
    columns = {"MM/DD/YY": "date", 
               "Date/Time Combined": "time_min",
               "Temp ('C)": "temperature",
               "SpCond (ms/cm)": "conductivity",
               "Salinity (ppt)": "salinity",
               "ODO (mg/L)": "ODO",
               "Turb (NTU)": "turbidity",
               "Chl (ug/L)": "chlorophyll",
              }
    ).set_index("time_min")

data_WW = pd.concat([data_WW[data_WW.index.year < 2022], data_WW_2])
data_WW["log10_chlorophyll"] = data_WW["chlorophyll"].map(np.log10)
#data_WW = data_WW.dropna()


  new_values = map_f(values, mapper)
  new_values = map_f(values, mapper)


In [6]:
data_WW

Unnamed: 0_level_0,date,temperature,conductivity,salinity,pH,ODO,turbidity,chlorophyll,log10_chlorophyll
time_min,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-10-12 12:53:04,2018-10-12,24.221,46.978,30.55,7.81,5.84,21.72,3.87,0.587711
2018-10-12 14:03:38,2018-10-12,24.615,46.755,30.38,7.86,6.37,19.17,6.34,0.802089
2018-10-12 14:17:40,2018-10-12,24.757,46.513,30.20,7.85,6.54,20.72,6.97,0.843233
2018-10-12 14:32:41,2018-10-12,24.898,46.381,30.10,7.85,6.71,21.37,7.67,0.884795
2018-10-12 14:47:41,2018-10-12,24.940,46.315,30.05,7.86,6.81,23.71,7.33,0.865104
...,...,...,...,...,...,...,...,...,...
2022-12-24 05:02:12,2022-12-24,4.105,46.071,29.17,7.84,9.80,17.23,3.00,0.477121
2022-12-24 05:17:12,2022-12-24,3.908,46.023,29.11,7.84,9.79,17.85,3.09,0.489958
2022-12-24 05:32:12,2022-12-24,3.921,46.073,29.15,7.83,9.72,21.78,3.06,0.485721
2022-12-24 05:47:12,2022-12-24,4.077,45.867,29.02,7.81,9.41,42.95,3.61,0.557507


In [7]:
def matmul_log(A, log_b):
    res = []
    for tmp_line in A:
        if np.any(tmp_line == 0.) :
            if np.all(tmp_line == 0.) :
                res.append(logsumexp(np.log([1 / len(tmp_line) for i in range(len(tmp_line))]) + log_b))
            else:
                res.append(logsumexp(np.log(tmp_line[tmp_line != 0]) + log_b[tmp_line != 0]))
        else:
            res.append(logsumexp(np.log(tmp_line) + log_b))
    return np.array(res)


In [25]:
class MSLR():
    
    def __init__(self, n_components = 2, covariance_type="full", n_iter = 10, reg_method = "OLS", kargs_reg = None):
        self.n_components = n_components
        self.n_iter = n_iter
        self.covariance_type = covariance_type 
        self.tol = 0.0
        
        if kargs_reg is None:
            kargs_reg = dict()
        self.kargs_reg = kargs_reg
            
        if reg_method in {"OLS", "LR", "LinearRegression", }:
            self.reg_method = LinearRegression
        elif reg_method in {"Ridge", }:
            self.reg_method = Ridge
        elif reg_method in {"RidgeCV", }:
            self.reg_method = RidgeCV
        elif reg_method in {"LinearSVR", "LinearSVM", }:
            self.reg_method = LinearSVR
        else:
            self.reg_method = reg_method
        
        return
    
    
    def fit_predict(self, X, Y, X_test, is_multiple_sequence = False, forecast_horizon = 1):
        
        #matmul_log = lambda A, log_b: np.array([logsumexp(np.log(tmp_line) + log_b) for tmp_line in A])
        
        if is_multiple_sequence:
            list_X, list_Y = [np.array(i) for i in X], [np.array(i) for i in Y]
            list_X_test = [np.array(i) for i in X_test] if X_test is not None else None
        else:
            list_X, list_Y = [np.array(X)], [np.array(Y)]
            list_X_test = [np.array(X_test)] if X_test is not None else None
        
        marginal_X, marginal_Y = np.concatenate(list_X), np.concatenate(list_Y)
        gmm = GaussianMixture(n_components = self.n_components, covariance_type = self.covariance_type, random_state=434)
        gmm.fit(marginal_Y)
        
        n_seq = len(list_X)
        list_T = [len(i) for i in list_X]
        
        p_X, p = len(list_X[0][0]), len(list_Y[0][0])
        K = self.n_components
        self.n_features = p
        
        self.transmat_ = np.array([[1. / K for j in range(K)] for i in range(K)])

        self.list_loglik_ = []
        for epoch in range(self.n_iter):

            # M-step
            if epoch != 0:
                tmp_weight_mat = []
                for cur_log_forward_prob, cur_log_backward_prob in zip(list_cur_log_forward_prob, list_cur_log_backward_prob):
                    tmp_log_weight_mat = cur_log_forward_prob + cur_log_backward_prob
                    tmp_log_weight_mat -= logsumexp(tmp_log_weight_mat[-1, :])
                    tmp_weight_mat.append(np.exp(tmp_log_weight_mat))
                tmp_weight_mat = np.concatenate(tmp_weight_mat)
            else:
                tmp_weight_mat = gmm.predict_proba(marginal_Y)
                
            list_lr_cov = []
            for ii in range(K):
                tmp_weight = tmp_weight_mat[:, ii]
                tmp_lr = self.reg_method(**self.kargs_reg)
                tmp_lr.fit(marginal_X, marginal_Y, sample_weight = tmp_weight)
                tmp_resid = marginal_Y - tmp_lr.predict(marginal_X)
                if self.covariance_type == "full":
                    tmp_cov_sum = np.zeros(shape = (p, p))
                    for jj in range(len(marginal_X)):
                        tmp_x = tmp_resid[jj, :]
                        tmp_cov_sum += np.outer(tmp_x, tmp_x) * tmp_weight[jj]
                    tmp_cov = tmp_cov_sum / np.sum(tmp_weight) 
                elif self.covariance_type == "diag":
                    tmp_cov = np.array([max(np.sum(tmp_weight * (tmp_resid[:, j] ** 2)) / np.sum(tmp_weight), 1e-12) for j in range(p)])
                list_lr_cov.append((tmp_lr, tmp_cov))
            self.list_lr_cov = list_lr_cov
            
            list_cur_mat_log_b = self._calc_emission_mat(list_X, list_Y)
            list_list_log_emission = [[np.copy(list_cur_mat_log_b[index_X][i, :]) for i in range(T)] for index_X, T in enumerate(list_T)]
                        
            if epoch != 0:
                cur_log_initprob = [list_cur_log_forward_prob[i][0] + list_cur_log_backward_prob[i][0] for i in range(n_seq)]
            else:
                cur_log_initprob = [[np.log(i) if i != 0 else -100 for i in line] for line in gmm.predict_proba([Y[0] for Y in list_Y])]
            
            cur_log_initprob = np.sum(cur_log_initprob, axis = 0)
            cur_log_initprob -= logsumexp(cur_log_initprob)
            self.startprob_ = np.exp(cur_log_initprob)
            self.log_startprob_ = cur_log_initprob

            # E-step      
            list_cur_log_forward_prob = [[] for i in range(n_seq)]
            for index_X, X in enumerate(list_X):
                for tt in range(len(X)):
                    if tt == 0:
                        tmp_log_prob = cur_log_initprob + list_cur_mat_log_b[index_X][0, :]
                    else:
                        tmp_log_prob = matmul_log(np.transpose(self.transmat_), list_cur_log_forward_prob[index_X][-1]) + list_cur_mat_log_b[index_X][tt, :]
                    list_cur_log_forward_prob[index_X].append(tmp_log_prob)
            list_cur_log_forward_prob = [np.array(i) for i in list_cur_log_forward_prob]
            
            list_cur_log_backward_prob = [[0 for i in range(T)] for T in list_T]
            for index_X, (X, T) in enumerate(zip(list_X, list_T)):
                for tt in range(T - 1, -1, -1):
                    if tt == T - 1:
                        tmp_log_prob = np.array([np.log(1.) for i in range(K)])
                    else:
                        tmp_log_prob = []
                        for ii in range(K):
                            tmp_log_prob.append(logsumexp([np.log(self.transmat_[ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
                        tmp_log_prob = np.array(tmp_log_prob)
                    list_cur_log_backward_prob[index_X][tt] = tmp_log_prob
            list_cur_log_backward_prob = [np.array(i) for i in list_cur_log_backward_prob]
            
            list_tmp_array2d_log_gamma = [i + j for (i, j) in zip(list_cur_log_forward_prob, list_cur_log_backward_prob)]
            list_tmp_array1d_log_gamma_sum = [[logsumexp(i[t, :]) for t in range(len(i))] for i in list_tmp_array2d_log_gamma]
            
            list_tmp_array3d_log_epsilon = [np.zeros(shape = (T - 1, K, K)) for T in list_T]
            for index_X, (X, T) in enumerate(zip(list_X, list_T)):
                for tt in range(T - 1):
                    for ii in range(K):
                        for jj in range(K):
                            list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(self.transmat_[ii, jj]) + \
                                    list_cur_log_backward_prob[index_X][tt + 1, jj] + list_cur_mat_log_b[index_X][tt + 1, jj] - list_tmp_array1d_log_gamma_sum[index_X][tt]            
        
            self.transmat_ = self._calc_transmat(list_tmp_array3d_log_epsilon)
            
            loglik = np.sum([logsumexp(i[-1]) for i in list_cur_log_forward_prob])
            self.list_loglik_.append(loglik)
            
            if (len(self.list_loglik_) >= 2) and (abs(self.list_loglik_[-1] - self.list_loglik_[-2]) < self.tol):
                break
            
            #print("epoch % s, loglik = % s" % (epoch, loglik))
            #print(cur_list_mvn_mean_cov)
            #print(cur_transmat)
        
        self.list_last_posterior = [i[-1] + j[-1] for (i, j) in zip(list_cur_log_backward_prob, list_cur_log_forward_prob)]
        for ii in range(n_seq):
            tmp = self.list_last_posterior[ii]
            tmp -= np.max(tmp)
            tmp = np.exp(tmp)
            tmp /= np.sum(tmp)
            self.list_last_posterior[ii] = tmp
        
        if list_X_test is None:
            pred = None
        else:
            pred = []
            for ii in range(n_seq):
                tmp_pred_prob = self.list_last_posterior[ii] @ np.linalg.matrix_power(self.transmat_, forecast_horizon)
                tmp_pred = 0
                for tmp_k in range(K):
                    tmp_pred += list_lr_cov[tmp_k][0].predict([list_X_test[ii]])[0] * tmp_pred_prob[tmp_k]
                pred.append(tmp_pred)

            if not is_multiple_sequence:
                pred = pred[0]
        
        return pred
    
    
    def fit(self, X, Y, is_multiple_sequence = False):
        self.fit_predict(X = X, Y = Y, X_test = None, is_multiple_sequence = is_multiple_sequence)
        return
    
    
    def _calc_transmat(self, list_tmp_array3d_log_epsilon):
        K = self.n_components
        cur_transmat = np.zeros((K, K)) 
        for ii in range(K):
            tmp_list_log_prob = []
            for jj in range(K):
                tmp_log_prob = []
                for index_X in range(len(list_tmp_array3d_log_epsilon)):
                    tmp_log_prob.append(logsumexp(list_tmp_array3d_log_epsilon[index_X][:, ii, jj]))
                tmp_list_log_prob.append(logsumexp(tmp_log_prob))
            tmp_list_log_prob = np.array(tmp_list_log_prob)
            tmp_list_log_prob -= np.max(tmp_list_log_prob)
            tmp_prob = np.exp(tmp_list_log_prob)
            tmp_prob /= sum(tmp_prob)
            cur_transmat[ii, :] = tmp_prob
        return cur_transmat
    
    
    def _calc_emission_mat(self, list_X, list_Y):
        
        list_T = [len(i) for i in list_X]
        p, K = self.n_features, self.n_components
        
        list_cur_mat_log_b = [np.zeros(shape = (i, K)) for i in list_T]
        for ii, (tmp_lr, tmp_cov) in enumerate(self.list_lr_cov):
            for index_X, (X, Y) in enumerate(zip(list_X, list_Y)):
                tmp_resid = Y - tmp_lr.predict(X)
                if self.covariance_type == "full":
                    list_cur_mat_log_b[index_X][:, ii] = scipy.stats.multivariate_normal.logpdf(tmp_resid, mean=[0] * p, cov=tmp_cov)
                elif self.covariance_type == "diag":
                    tmp_logprob = 0
                    for jj in range(p):
                        tmp_logprob += scipy.stats.norm.logpdf(tmp_resid[:, jj], loc = 0, scale = np.sqrt(tmp_cov[jj]))
                    list_cur_mat_log_b[index_X][:, ii] = tmp_logprob

        return list_cur_mat_log_b
    
    
# def _test_MSLR():
    
#     list_beta = [np.array([[0.5, 0.5], [0.5, 0.5], [0.5, 0.5]]), 
#                  np.array([[0.1, 0.5], [0.8, 0.5], [0.2, 0.6]]), 
#                  np.array([[-0.5, 0.7], [-0.5, -0.8], [-0.5, -0.2]]),]
#     list_intercept = [np.array([1., 2.]), np.array([8., 2.]), np.array([5., 3.])]
#     list_cov = [np.array([[1., 0.1], [0.1, 1.]]), np.array([[1.5, 0.1], [0.1, 1.5]]), np.array([[3., 0.1], [0.1, 3.]])]
#     transmat = np.array([[0.9, 0.05,  0.05], [ 0.3, 0.6,  0.1], [ 0.3,  0.2, 0.5]])
#     init_prob = np.array([1 / 3, 1 / 2, 1 / 6])
#     T = 10000
    
#     list_X, list_Y = [], []
#     list_X_test = []
#     for ii in range(5):
        
#         X = np.random.normal(loc=1.0, scale=5.0, size=(T, 3))
#         X_test = np.random.normal(loc=1.0, scale=5.0, size=(1, 3))
#         Y, list_h_label = [], []
#         for tt in range(T):
#             if tt == 0:
#                 tmp_h_label = np.argmax(np.random.multinomial(n = 1, pvals = init_prob, size = 1))
#             else:
#                 tmp_h_label = np.argmax(np.random.multinomial(n = 1, pvals = transmat[tmp_h_label], size = 1))
#             list_h_label.append(tmp_h_label)
#             tmp_Y = X[tt, :] @ list_beta[tmp_h_label] + list_intercept[tmp_h_label] + np.random.multivariate_normal(mean = [0, 0], cov = list_cov[tmp_h_label])
#             Y.append(tmp_Y)
#         Y = np.array(Y)
        
#         list_X.append(X)
#         list_Y.append(Y)
#         list_X_test.append(X_test)
        
#     mslr = MSLR(n_components=3, covariance_type="full", n_iter=10)
#     pred = mslr.fit_predict(list_X, list_Y, list_X_test, is_multiple_sequence=True)
#     print(pred)
#     print(mslr.transmat_)
#     print(mslr.startprob_)
#     for tmp_lr, tmp_cov in mslr.list_lr_cov:
#         print(tmp_lr.coef_)
#         print(tmp_lr.intercept_)
#         print(tmp_cov)

#     return


# _test_MSLR()


In [26]:
class MSLRX():
    
    def __init__(self, n_components = 2, covariance_type="full", n_iter = 10, reg_method = "OLS", kargs_reg = None, 
                 is_logistic_regression_CV = False, logistic_regression_C = 1e10, is_logistic_regression_standardized = False,
                is_state_coef_indep = False, is_ordinal_logit = False, tol_loglik = 0.):
        
        self.n_components = n_components
        self.covariance_type = covariance_type 
        self.n_iter = n_iter
        self.min_iter = 3
        self.is_logistic_regression_CV = is_logistic_regression_CV
        self.logistic_regression_C = logistic_regression_C
        self.is_state_coef_indep = is_state_coef_indep
        self.is_ordinal_logit = is_ordinal_logit    # only valid when the response is 1-dim so we could sort it
        self.tol_loglik = tol_loglik
        self.is_print_loglik = False
        
        if kargs_reg is None:
            kargs_reg = dict()
        self.kargs_reg = kargs_reg
            
        if reg_method in {"OLS", "LR", "LinearRegression", }:
            self.reg_method = LinearRegression
        elif reg_method in {"Ridge", }:
            self.reg_method = Ridge
        elif reg_method in {"RidgeCV", }:
            self.reg_method = RidgeCV
        elif reg_method in {"LinearSVR", "LinearSVM", }:
            self.reg_method = LinearSVR
        else:
            self.reg_method = reg_method
        
        self.is_logistic_regression_standardized = is_logistic_regression_standardized
        if is_logistic_regression_standardized:
            self.standardizer = StandardScaler()
            
        return
    
    
    def _est_trans_mat(self, list_exog):
        
        list_list_trans_mat = []
        for exog in list_exog:
            T, K = len(exog), self.n_components
            list_trans_mat = []
            for tmp_exog in exog:
                tmp_trans_mat = []
                for ii in range(K):
                    if self.is_state_coef_indep or self.is_ordinal_logit:
                        if self.is_initialized:
                            
                            ii_index = self.list_sorted_index[ii] if self.is_ordinal_logit else ii
                            tmp_new_exog = [int(i == ii_index) for i in range(K)]
                            tmp_new_exog.extend(tmp_exog)
                            tmp_prob = self.logit_clf.predict_proba(np.array([tmp_new_exog]))[0]
                            if self.is_ordinal_logit:
                                tmp_prob = tmp_prob[self.list_sorted_index_to_original_index]
                            
                        else:
                            tmp_prob = [1 / K for i in range(K)]
                    else:
                        tmp_log_prob = tmp_exog @ self.list_coef_[ii] + self.list_intercept_[ii]
                        tmp_log_prob -= np.max(tmp_log_prob)
                        tmp_prob = np.exp(tmp_log_prob)
                        tmp_prob /= np.sum(tmp_prob)
                    tmp_trans_mat.append(tmp_prob)
                list_trans_mat.append(tmp_trans_mat)
            list_list_trans_mat.append(np.array(list_trans_mat))
            
        return list_list_trans_mat
    
    
    def _update_coef(self, list_array3d_log_epsilon, list_exog):
        
        if self.is_ordinal_logit:
            self.list_sorted_index = np.argsort([i[0] for i, j in self.list_mvn_mean_cov])
            self.list_sorted_index_to_original_index = [i for i, j in sorted(list(enumerate(self.list_sorted_index)), key = lambda x: x[1])]
            #print(self.list_mvn_mean_cov)
            #print([i[0] for i, j in self.list_mvn_mean_cov])
            #print(self.list_sorted_index, self.list_sorted_index_to_original_index)
        
        if self.is_state_coef_indep or self.is_ordinal_logit:
            K = self.n_components
            if self.is_ordinal_logit:
                logit_clf = mord.LogisticAT(alpha = 1 / self.logistic_regression_C, verbose = 0, max_iter = 100)
            else:
                if self.is_logistic_regression_CV:
                    logit_clf = LogisticRegressionCV(multi_class = "multinomial", max_iter = 100)
                else:
                    logit_clf = LogisticRegression(C = self.logistic_regression_C, multi_class = "multinomial", max_iter = 100)
                
            tmp_X, tmp_y, tmp_weight = [], [], []
            tmp_X_nan, tmp_y_nan, tmp_weight_nan = [], [], []
            
            for ii in range(K):
                for array3d_log_epsilon, exog in zip(list_array3d_log_epsilon, list_exog):
                    T = len(array3d_log_epsilon)
                    for tt in range(T):
                        for jj in range(K):
                            tmp_weight_cell = np.exp(array3d_log_epsilon[tt, ii, jj])
                            
                            ii_index = self.list_sorted_index[ii] if self.is_ordinal_logit else ii
                            jj_index = self.list_sorted_index[jj] if self.is_ordinal_logit else jj
                            
                            tmp_exog = [int(i == ii_index) for i in range(K)]
                            tmp_exog.extend(exog[tt])
                            
                            if np.isnan(tmp_weight_cell) or np.isinf(tmp_weight_cell):
                                tmp_X_nan.append(tmp_exog)
                                tmp_y_nan.append(jj_index)
                                tmp_weight_nan.append(tmp_weight_cell)
                            else:
                                tmp_X.append(tmp_exog)
                                tmp_y.append(jj_index)
                                tmp_weight.append(tmp_weight_cell)
            
            if len(tmp_y_nan) >= 1: 
                print("Warning: encountering % s nan in weight in ghmm_exog._update_coef" % len(tmp_y_nan))
                if len(set(tmp_y)) < K:
                    tmp_X.extend(tmp_X_nan)
                    tmp_y.extend(tmp_y_nan)
                    tmp_weight.extend([1 / K for i in range(len(tmp_weight_nan))])

            logit_clf.fit(np.array(tmp_X), np.array(tmp_y), np.array(tmp_weight)) 
            self.logit_clf = logit_clf
            
        else:
            K = self.n_components
            self.list_coef_, self.list_intercept_ = [], []
            for ii in range(K):
                if self.is_logistic_regression_CV:
                    logit_clf = LogisticRegressionCV(multi_class = "multinomial", max_iter = 100)
                else:
                    logit_clf = LogisticRegression(C = self.logistic_regression_C, multi_class = "multinomial", max_iter = 100)
                tmp_X, tmp_y, tmp_weight = [], [], []
                tmp_X_nan, tmp_y_nan, tmp_weight_nan = [], [], []
                for array3d_log_epsilon, exog in zip(list_array3d_log_epsilon, list_exog):
                    T = len(array3d_log_epsilon)
                    for tt in range(T):
                        for jj in range(K):
                            tmp_weight_cell = np.exp(array3d_log_epsilon[tt, ii, jj])
                            if np.isnan(tmp_weight_cell) or np.isinf(tmp_weight_cell):
                                tmp_X_nan.append(exog[tt])
                                tmp_y_nan.append(jj)
                                tmp_weight_nan.append(tmp_weight_cell)
                            else:
                                tmp_X.append(exog[tt])
                                tmp_y.append(jj)
                                tmp_weight.append(tmp_weight_cell)

                if len(tmp_y_nan) >= 1: 
                    print("Warning: encountering % s nan in weight in ghmm_exog._update_coef" % len(tmp_y_nan))
                    if len(set(tmp_y)) < K:
                        tmp_X, tmp_y, tmp_weight = np.array(tmp_X_nan), np.array(tmp_y_nan), np.array(tmp_weight_nan)
                        tmp_weight = 1

                logit_clf.fit(tmp_X, tmp_y, tmp_weight)

                if self.n_components == 2:
                    tmp_coef = np.array([- logit_clf.coef_.flatten(), logit_clf.coef_.flatten()])
                    tmp_intercept = np.array([- logit_clf.intercept_[0], logit_clf.intercept_[0]])
                else:
                    tmp_coef = logit_clf.coef_[np.argsort(logit_clf.classes_), :]
                    tmp_intercept = logit_clf.intercept_[np.argsort(logit_clf.classes_)]
                self.list_coef_.append(np.transpose(tmp_coef))
                self.list_intercept_.append(tmp_intercept)
                
        return
    
    
    def fit_predict(self, X, Y, exog, X_test, is_multiple_sequence = False, exog_additional = None):
        
        #matmul_log = lambda A, log_b: np.array([logsumexp(np.log(tmp_line) + log_b) for tmp_line in A])
        
        self.is_fitted = True
        self.is_initialized = False
        
        if is_multiple_sequence:
            list_X, list_Y = [np.array(i) for i in X], [np.array(i) for i in Y]
            list_X_test = [np.array(i) for i in X_test] if X_test is not None else None
            list_exog = [np.array(i) for i in exog]
            list_exog_additional = [np.array(i) for i in exog_additional] if exog_additional is not None else None
        else:
            list_X, list_Y = [np.array(X)], [np.array(Y)]
            list_X_test = [np.array(X_test)] if X_test is not None else None
            list_exog = [np.array(exog)]
            list_exog_additional = [np.array(exog_additional)] if exog_additional is not None else None
            
        if self.is_logistic_regression_standardized:
            self.standardizer.fit(np.concatenate(list_exog))
            list_exog = [self.standardizer.transform(i, copy=True) for i in list_exog]
        
        n_seq = len(list_Y)
        list_T = [len(i) for i in list_Y]
        
        p_X, p = len(list_X[0][0]), len(list_Y[0][0])
        p_exog = len(list_exog[0][0])
        K = self.n_components
        self.n_features = p
        
        marginal_X, marginal_Y = np.concatenate(list_X), np.concatenate(list_Y)
        gmm = GaussianMixture(n_components = self.n_components, covariance_type = self.covariance_type, random_state=434)
        gmm.fit(marginal_Y)
        
        self.list_coef_ = [np.zeros(shape = (p_exog, K)) for i in range(K)]
        self.list_intercept_ = [np.zeros(shape = K) for i in range(K)]
        list_cur_list_trans_mat = self._est_trans_mat(list_exog)

        self.list_loglik_ = []
        for epoch in range(self.n_iter):

            # M-step
            if epoch != 0:
                tmp_weight_mat = []
                for cur_log_forward_prob, cur_log_backward_prob in zip(list_cur_log_forward_prob, list_cur_log_backward_prob):
                    tmp_log_weight_mat = cur_log_forward_prob + cur_log_backward_prob
                    tmp_log_weight_mat -= logsumexp(tmp_log_weight_mat[-1, :])
                    tmp_weight_mat.append(np.exp(tmp_log_weight_mat))
                tmp_weight_mat = np.concatenate(tmp_weight_mat)
            else:
                tmp_weight_mat = gmm.predict_proba(marginal_Y)
                
            list_lr_cov = []
            for ii in range(K):
                tmp_weight = tmp_weight_mat[:, ii]
                tmp_lr = self.reg_method(**self.kargs_reg)
                tmp_lr.fit(marginal_X, marginal_Y, sample_weight = tmp_weight)
                tmp_resid = marginal_Y - tmp_lr.predict(marginal_X)
                if self.covariance_type == "full":
                    tmp_cov_sum = np.zeros(shape = (p, p))
                    for jj in range(len(marginal_X)):
                        tmp_x = tmp_resid[jj, :]
                        tmp_cov_sum += np.outer(tmp_x, tmp_x) * tmp_weight[jj]
                    tmp_cov = tmp_cov_sum / np.sum(tmp_weight) 
                elif self.covariance_type == "diag":
                    tmp_cov = np.array([max(np.sum(tmp_weight * (tmp_resid[:, j] ** 2)) / np.sum(tmp_weight), 1e-12) for j in range(p)])
                list_lr_cov.append((tmp_lr, tmp_cov))
            self.list_lr_cov = list_lr_cov
            
            list_cur_mat_log_b = self._calc_emission_mat(list_X, list_Y)
                        
            if epoch != 0:
                cur_log_initprob = [list_cur_log_forward_prob[i][0] + list_cur_log_backward_prob[i][0] for i in range(n_seq)]
            else:
                cur_log_initprob = [[np.log(i) if i != 0 else -100 for i in line] for line in gmm.predict_proba([Y[0] for Y in list_Y])]
            
            cur_log_initprob = np.sum(cur_log_initprob, axis = 0)
            cur_log_initprob -= logsumexp(cur_log_initprob)
            self.startprob_ = np.exp(cur_log_initprob)
            self.log_startprob_ = cur_log_initprob

            # E-step      
            list_cur_log_forward_prob = [[] for i in range(n_seq)]
            for index_X, X in enumerate(list_X):
                for tt in range(len(X)):
                    if tt == 0:
                        tmp_log_prob = cur_log_initprob + list_cur_mat_log_b[index_X][0, :]
                    else:
                        tmp_log_prob = matmul_log(np.transpose(list_cur_list_trans_mat[index_X][tt - 1]), list_cur_log_forward_prob[index_X][-1]) + list_cur_mat_log_b[index_X][tt, :]
                    list_cur_log_forward_prob[index_X].append(tmp_log_prob)
            list_cur_log_forward_prob = [np.array(i) for i in list_cur_log_forward_prob]
            
            list_cur_log_backward_prob = [[0 for i in range(T)] for T in list_T]
            for index_X, (X, T) in enumerate(zip(list_X, list_T)):
                for tt in range(T - 1, -1, -1):
                    if tt == T - 1:
                        tmp_log_prob = np.array([np.log(1.) for i in range(K)])
                    else:
                        tmp_log_prob = []
                        for ii in range(K):
                            tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
                        tmp_log_prob = np.array(tmp_log_prob)
                    list_cur_log_backward_prob[index_X][tt] = tmp_log_prob
            list_cur_log_backward_prob = [np.array(i) for i in list_cur_log_backward_prob]
            
            list_tmp_array2d_log_gamma = [i + j for (i, j) in zip(list_cur_log_forward_prob, list_cur_log_backward_prob)]
            list_tmp_array1d_log_gamma_sum = [[logsumexp(i[t, :]) for t in range(len(i))] for i in list_tmp_array2d_log_gamma]
            
            list_tmp_array3d_log_epsilon = [np.zeros(shape = (T - 1, K, K)) for T in list_T]
            for index_X, (X, T) in enumerate(zip(list_X, list_T)):
                for tt in range(T - 1):
                    for ii in range(K):
                        for jj in range(K):
                            list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
                                    list_cur_log_backward_prob[index_X][tt + 1, jj] + list_cur_mat_log_b[index_X][tt + 1, jj] - list_tmp_array1d_log_gamma_sum[index_X][tt]            
        
            self._update_coef(list_tmp_array3d_log_epsilon, list_exog) # update coef and intercept for transmat
            list_cur_list_trans_mat = self._est_trans_mat(list_exog)
            self.list_cur_list_trans_mat = list_cur_list_trans_mat
            
            loglik = np.sum([logsumexp(i[-1]) for i in list_cur_log_forward_prob])
            self.list_loglik_.append(loglik)
            
            self.is_initialized = True
            
            if self.is_print_loglik:
                print("epoch % s, loglik = % s" % (epoch, loglik))
            
            if (len(self.list_loglik_) >= 2) and (abs(self.list_loglik_[-1] - self.list_loglik_[-2]) < self.tol_loglik):
                break
        
        self.list_last_posterior = [i[-1] + j[-1] for (i, j) in zip(list_cur_log_backward_prob, list_cur_log_forward_prob)]
        self.list_log_pred_posterior = []
            
        list_transmat_additional = self._est_trans_mat(list_exog_additional) if list_exog_additional is not None else None
        
        for ii in range(n_seq):
            tmp = self.list_last_posterior[ii]
            tmp -= logsumexp(tmp)
            self.list_last_posterior[ii] = np.exp(tmp)
            tmp_transmat = list_cur_list_trans_mat[ii][-1]
            if list_transmat_additional is not None:
                for tmp_transmat_additional in list_transmat_additional[ii]:
                    tmp_transmat = tmp_transmat @ tmp_transmat_additional
            self.list_log_pred_posterior.append(matmul_log(np.transpose(tmp_transmat), tmp))
            
        if list_X_test is None:
            pred = None
        else:
            pred = []
            for ii in range(n_seq):
                tmp_pred_prob = np.exp(self.list_log_pred_posterior[ii])
                tmp_pred = 0
                for tmp_k in range(K):
                    tmp_pred += list_lr_cov[tmp_k][0].predict([list_X_test[ii]])[0] * tmp_pred_prob[tmp_k]
                pred.append(tmp_pred)

            if not is_multiple_sequence:
                pred = pred[0]
        
        return pred
    
    
    def fit(self, X, Y, exog, is_multiple_sequence = False):
        self.fit_predict(X = X, Y = Y, exog = exog, X_test = None, is_multiple_sequence = is_multiple_sequence)
        return
    
    
    def _calc_emission_mat(self, list_X, list_Y):
        
        list_T = [len(i) for i in list_X]
        p, K = self.n_features, self.n_components
        
        list_cur_mat_log_b = [np.zeros(shape = (i, K)) for i in list_T]
        for ii, (tmp_lr, tmp_cov) in enumerate(self.list_lr_cov):
            for index_X, (X, Y) in enumerate(zip(list_X, list_Y)):
                tmp_resid = Y - tmp_lr.predict(X)
                if self.covariance_type == "full":
                    list_cur_mat_log_b[index_X][:, ii] = scipy.stats.multivariate_normal.logpdf(tmp_resid, mean=[0] * p, cov=tmp_cov)
                elif self.covariance_type == "diag":
                    tmp_logprob = 0
                    for jj in range(p):
                        tmp_logprob += scipy.stats.norm.logpdf(tmp_resid[:, jj], loc = 0, scale = np.sqrt(tmp_cov[jj]))
                    list_cur_mat_log_b[index_X][:, ii] = tmp_logprob

        return list_cur_mat_log_b
    
    
    def online_predict(self, X_test, Y_test, exog_test, is_multiple_sequence = False):
        
        if not self.is_fitted:
            print("Error: Not fitted. MSLRX's online_predict method must be used AFTER fit")
            return
            
        #matmul_log = lambda A, log_b: np.array([logsumexp(np.log(tmp_line) + log_b) for tmp_line in A])
        
        if is_multiple_sequence:
            list_X_test, list_Y_test = [np.array(i) for i in X_test], [np.array(i) for i in Y_test]  
            list_exog_test = [np.array(i) for i in exog_test] 
        else:
            list_X_test, list_Y_test = [np.array(X_test)], [np.array(Y_test)] 
            list_exog_test = [np.array(exog_test)] 

        list_T = [len(i) for i in list_Y_test]
        K = self.n_components
            
        if self.is_logistic_regression_standardized:
            list_exog_test = [self.standardizer.transform(i, copy=True) for i in list_exog_test]
        
        list_cur_mat_log_b = self._calc_emission_mat(list_X_test, list_Y_test)
        list_cur_list_trans_mat = self._est_trans_mat(list_exog_test)

        list_pred = []
        for ii, (cur_mat_log_b, cur_list_trans_mat) in enumerate(zip(list_cur_mat_log_b, list_cur_list_trans_mat)):
            pred = []
            log_last_posterior = self.list_log_pred_posterior[ii]
            for tt in range(list_T[ii]):
                
                log_last_posterior = log_last_posterior + cur_mat_log_b[tt, :]         
                log_last_posterior = matmul_log(np.transpose(cur_list_trans_mat[tt]), log_last_posterior)                    
                log_last_posterior -= logsumexp(log_last_posterior)
                tmp_posterior = np.exp(log_last_posterior)
                
                if tt + 1 < len(list_X_test[ii]):
                    tmp_mean_pred = 0
                    for tmp_k in range(K):
                        tmp_mean_pred += self.list_lr_cov[tmp_k][0].predict([list_X_test[ii][tt + 1]])[0] * tmp_posterior[tmp_k]
                    pred.append(tmp_mean_pred)
                
            self.list_log_pred_posterior[ii] = log_last_posterior
            pred = np.array(pred)
            list_pred.append(pred)
            
        if not is_multiple_sequence:
            list_pred = list_pred[0]
        list_pred = np.array(list_pred)

        res = list_pred
            
        return res
    
    
# def _test_MSLRX():
    
#     start_time = datetime.datetime.now()
#     print("start running _test_MSLRX()")
    
#     np.random.seed(434)
    
#     list_beta = [np.array([[0.5, 0.5], [0.5, 0.5], [0.5, 0.5]]), 
#                  np.array([[0.1, 0.5], [0.8, 0.5], [0.2, 0.6]]), 
#                  np.array([[-0.5, 0.7], [-0.5, -0.8], [-0.5, -0.2]]),]
#     list_intercept = [np.array([1., 2.]), np.array([8., 2.]), np.array([5., 3.])]
#     list_cov = [np.array([[1., 0.1], [0.1, 1.]]), np.array([[1.5, 0.1], [0.1, 1.5]]), np.array([[3., 0.1], [0.1, 3.]])]
#     transmat = np.array([[0.9, 0.05,  0.05], [ 0.3, 0.6,  0.1], [ 0.3,  0.2, 0.5]])
#     init_prob = np.array([1 / 3, 1 / 2, 1 / 6])
#     n_components = len(list_beta)
#     K = n_components
#     p_X = len(list_beta[0])
#     p_exog = 5
    
#     list_coef_transmat = [np.random.uniform(size = (p_exog, K), low = -1., high = 1.) for i in range(K)]
#     list_intercept_transmat = [np.random.uniform(size = K) for i in range(K)]
#     for line in list_coef_transmat:
#         line[0, :] = 0
    
#     T_train = 1000
#     T_test = 50
#     T = T_train + T_test
    
#     list_X, list_Y, list_exog = [], [], []
#     for ii in range(10):
        
#         X = np.random.normal(loc=1.0, scale=5.0, size=(T, p_X))
#         exog = np.random.normal(loc=0.0, scale=1.0, size = (T, p_exog))
#         Y, list_h_label = [], []
#         for tt in range(T):
            
#             index = tt
            
#             if index != 0:
#                 trans_mat = []
#                 for ii in range(K):
#                     tmp_log_prob = exog[index - 1] @ list_coef_transmat[ii] + list_intercept_transmat[ii]
#                     tmp_log_prob -= np.max(tmp_log_prob)
#                     tmp_prob = np.exp(tmp_log_prob)
#                     tmp_prob /= np.sum(tmp_prob)
#                     trans_mat.append(tmp_prob)
#                 trans_mat = np.array(trans_mat)
#                 prob_h_tmp = (h_label_prev @ trans_mat)[0]
#             else:
#                 prob_h_tmp = init_prob
            
#             h_label_tmp_vec = np.random.multinomial(n = 1, pvals = prob_h_tmp, size = 1)
#             h_label_tmp = np.argmax(h_label_tmp_vec)
#             tmp_h_label = h_label_tmp
#             h_label_prev = h_label_tmp_vec
            
#             list_h_label.append(tmp_h_label)
            
#             tmp_Y = X[tt, :] @ list_beta[tmp_h_label] + list_intercept[tmp_h_label] + np.random.multivariate_normal(mean = [0, 0], cov = list_cov[tmp_h_label])
#             Y.append(tmp_Y)
            
#         Y = np.array(Y)
        
#         list_X.append(X)
#         list_Y.append(Y)
#         list_exog.append(exog)
        
#     print("data prepared", str(datetime.datetime.now() - start_time))
        
#     mslrx = MSLRX(n_components=3, covariance_type="full", n_iter=10, is_logistic_regression_standardized = True)
#     pred = mslrx.fit_predict(X = [X[:T_train] for X in list_X],
#                              Y = [Y[:T_train] for Y in list_Y],
#                              exog = [exog[:T_train] for exog in list_exog],
#                              X_test = [X[T_train, :] for X in list_X], 
#                              is_multiple_sequence=True,
#                             exog_additional = [exog[:1] for X in list_X],
#                             )
#     print(pred)
#     ol_pred = mslrx.online_predict(X_test = [X[T_train:] for X in list_X],
#                                    Y_test = [Y[T_train:] for Y in list_Y], 
#                                    exog_test = [exog[T_train:] for exog in list_exog],
#                                    is_multiple_sequence = True)
    
#     list_pred = [[] for i in range(len(list_X))]
#     for cc, (tmp_pred, tmp_pred_ol) in enumerate(zip(pred, ol_pred)):
#         list_pred[cc].append(tmp_pred)
#         list_pred[cc].extend(tmp_pred_ol)

#     print("r2 =", r2_score(y_true = np.array([Y[T_train] for Y in list_Y]).flatten(), y_pred=np.array(pred).flatten()))
#     print("r2 =", r2_score(y_true = np.array([Y[T_train:] for Y in list_Y]).flatten(), y_pred=np.array(list_pred).flatten()))
#     print("running time =", str(datetime.datetime.now() - start_time))
    
#     return


# _test_MSLRX()


In [27]:
class MSLRXSoluIII():
    
    def __init__(self, n_components = 2, covariance_type="full", reg_method = "OLS", kargs_reg = None, n_iter = 10, tol_loglik = 0.):
        
        self.n_components = n_components
        self.covariance_type = covariance_type 
        self.n_iter = n_iter
        self.min_iter = 3
        self.tol_loglik = tol_loglik
        self.is_print_loglik = False
        
        if kargs_reg is None:
            kargs_reg = dict()
        self.kargs_reg = kargs_reg
            
        if reg_method in {"OLS", "LR", "LinearRegression", }:
            self.reg_method = LinearRegression
        elif reg_method in {"Ridge", }:
            self.reg_method = Ridge
        elif reg_method in {"RidgeCV", }:
            self.reg_method = RidgeCV
        elif reg_method in {"LinearSVR", "LinearSVM", }:
            self.reg_method = LinearSVR
        else:
            self.reg_method = reg_method
            
        return
    
    
    def _est_trans_mat(self, list_exog):
        
        list_list_trans_mat = []
        for exog in list_exog:
            T, K = len(exog), self.n_components
            list_trans_mat = []
            for tmp_exog in exog:
                tmp_trans_mat = []
                for ii in range(K):
                    tmp_prob_1 = tmp_exog @ self.list_coef_[ii] + self.list_intercept_[ii]
                    tmp_prob_1 = max(min(tmp_prob_1, 1 - 1e-5), 1e-5)
                    tmp_prob_0 = 1 - tmp_prob_1
                    tmp_trans_mat.append([tmp_prob_0, tmp_prob_1])
                list_trans_mat.append(np.array(tmp_trans_mat))
            list_list_trans_mat.append(np.array(list_trans_mat))
            
        return list_list_trans_mat
    
    
    def _update_coef(self, list_array3d_log_epsilon, list_exog):
        
        K = self.n_components
        self.list_coef_, self.list_intercept_ = [], []
        for ii in range(K):
            
            tmp_X, tmp_y, tmp_weight = [], [], []
            tmp_X_nan, tmp_y_nan, tmp_weight_nan = [], [], []
            for array3d_log_epsilon, exog in zip(list_array3d_log_epsilon, list_exog):
                T = len(array3d_log_epsilon)
                for tt in range(T):
                    for jj in range(K):
                        tmp_weight_cell = np.exp(array3d_log_epsilon[tt, ii, jj])
                        if np.isnan(tmp_weight_cell) or np.isinf(tmp_weight_cell):
                            tmp_X_nan.append(exog[tt])
                            tmp_y_nan.append(jj)
                            tmp_weight_nan.append(tmp_weight_cell)
                        else:
                            tmp_X.append(exog[tt])
                            tmp_y.append(jj)
                            tmp_weight.append(tmp_weight_cell)

            if len(tmp_y_nan) >= 1: 
                print("Warning: encountering % s nan in weight in ghmm_exog._update_coef" % len(tmp_y_nan))
                if len(set(tmp_y)) < K:
                    tmp_X, tmp_y, tmp_weight = np.array(tmp_X_nan), np.array(tmp_y_nan), np.array(tmp_weight_nan)
                    tmp_weight = 1
            
            tmp_X = sm.add_constant(tmp_X, has_constant='add', prepend=True)
            res_glm = sm.GLM(exog = tmp_X, endog = tmp_y, family=sm.families.Binomial(link = sm.families.links.identity()), freq_weights = tmp_weight).fit()
            
            tmp_intercept = res_glm.params[0]
            tmp_coef = np.array(res_glm.params[1:])
            
            self.list_coef_.append(tmp_coef) 
            self.list_intercept_.append(tmp_intercept)

        return
    
    
    def fit_predict(self, X, Y, exog, X_test, is_multiple_sequence = False, exog_additional = None):
        
        #matmul_log = lambda A, log_b: np.array([logsumexp(np.log(tmp_line) + log_b) for tmp_line in A])
        
        self.is_fitted = True
        self.is_initialized = False
        
        if is_multiple_sequence:
            list_X, list_Y = [np.array(i) for i in X], [np.array(i) for i in Y]
            list_X_test = [np.array(i) for i in X_test] if X_test is not None else None
            list_exog = [np.array(i) for i in exog]
            list_exog_additional = [np.array(i) for i in exog_additional] if exog_additional is not None else None
        else:
            list_X, list_Y = [np.array(X)], [np.array(Y)]
            list_X_test = [np.array(X_test)] if X_test is not None else None
            list_exog = [np.array(exog)]
            list_exog_additional = [np.array(exog_additional)]
        
        n_seq = len(list_Y)
        list_T = [len(i) for i in list_Y]
        
        p_X, p = len(list_X[0][0]), len(list_Y[0][0])
        p_exog = len(list_exog[0][0])
        K = self.n_components
        self.n_features = p
        
        marginal_X, marginal_Y = np.concatenate(list_X), np.concatenate(list_Y)
        gmm = GaussianMixture(n_components = self.n_components, covariance_type = self.covariance_type, random_state=434)
        gmm.fit(marginal_Y)
        
        self.list_coef_ = [np.zeros(shape = p_exog) for i in range(K)]
        self.list_intercept_ = [1 / K for i in range(K)]
        list_cur_list_trans_mat = self._est_trans_mat(list_exog)

        self.list_loglik_ = []
        for epoch in range(self.n_iter):

            # M-step
            if epoch != 0:
                tmp_weight_mat = []
                for cur_log_forward_prob, cur_log_backward_prob in zip(list_cur_log_forward_prob, list_cur_log_backward_prob):
                    tmp_log_weight_mat = cur_log_forward_prob + cur_log_backward_prob
                    tmp_log_weight_mat -= logsumexp(tmp_log_weight_mat[-1, :])
                    tmp_weight_mat.append(np.exp(tmp_log_weight_mat))
                tmp_weight_mat = np.concatenate(tmp_weight_mat)
            else:
                tmp_weight_mat = gmm.predict_proba(marginal_Y)
                
            list_lr_cov = []
            for ii in range(K):
                tmp_weight = tmp_weight_mat[:, ii]
                tmp_lr = self.reg_method(**self.kargs_reg)
                tmp_lr.fit(marginal_X, marginal_Y, sample_weight = tmp_weight)
                tmp_resid = marginal_Y - tmp_lr.predict(marginal_X)
                if self.covariance_type == "full":
                    tmp_cov_sum = np.zeros(shape = (p, p))
                    for jj in range(len(marginal_X)):
                        tmp_x = tmp_resid[jj, :]
                        tmp_cov_sum += np.outer(tmp_x, tmp_x) * tmp_weight[jj]
                    tmp_cov = tmp_cov_sum / np.sum(tmp_weight) 
                elif self.covariance_type == "diag":
                    tmp_cov = np.array([max(np.sum(tmp_weight * (tmp_resid[:, j] ** 2)) / np.sum(tmp_weight), 1e-12) for j in range(p)])
                list_lr_cov.append((tmp_lr, tmp_cov))
            self.list_lr_cov = list_lr_cov
            
            list_cur_mat_log_b = self._calc_emission_mat(list_X, list_Y)
                        
            if epoch != 0:
                cur_log_initprob = [list_cur_log_forward_prob[i][0] + list_cur_log_backward_prob[i][0] for i in range(n_seq)]
            else:
                cur_log_initprob = [[np.log(i) if i != 0 else -100 for i in line] for line in gmm.predict_proba([Y[0] for Y in list_Y])]
            
            cur_log_initprob = np.sum(cur_log_initprob, axis = 0)
            cur_log_initprob -= logsumexp(cur_log_initprob)
            self.startprob_ = np.exp(cur_log_initprob)
            self.log_startprob_ = cur_log_initprob

            # E-step      
            list_cur_log_forward_prob = [[] for i in range(n_seq)]
            for index_X, X in enumerate(list_X):
                for tt in range(len(X)):
                    if tt == 0:
                        tmp_log_prob = cur_log_initprob + list_cur_mat_log_b[index_X][0, :]
                    else:
                        tmp_log_prob = matmul_log(np.transpose(list_cur_list_trans_mat[index_X][tt - 1]), list_cur_log_forward_prob[index_X][-1]) + list_cur_mat_log_b[index_X][tt, :]
                    list_cur_log_forward_prob[index_X].append(tmp_log_prob)
            list_cur_log_forward_prob = [np.array(i) for i in list_cur_log_forward_prob]
            
            list_cur_log_backward_prob = [[0 for i in range(T)] for T in list_T]
            for index_X, (X, T) in enumerate(zip(list_X, list_T)):
                for tt in range(T - 1, -1, -1):
                    if tt == T - 1:
                        tmp_log_prob = np.array([np.log(1.) for i in range(K)])
                    else:
                        tmp_log_prob = []
                        for ii in range(K):
                            tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
                        tmp_log_prob = np.array(tmp_log_prob)
                    list_cur_log_backward_prob[index_X][tt] = tmp_log_prob
            list_cur_log_backward_prob = [np.array(i) for i in list_cur_log_backward_prob]
            
            list_tmp_array2d_log_gamma = [i + j for (i, j) in zip(list_cur_log_forward_prob, list_cur_log_backward_prob)]
            list_tmp_array1d_log_gamma_sum = [[logsumexp(i[t, :]) for t in range(len(i))] for i in list_tmp_array2d_log_gamma]
            
            list_tmp_array3d_log_epsilon = [np.zeros(shape = (T - 1, K, K)) for T in list_T]
            for index_X, (X, T) in enumerate(zip(list_X, list_T)):
                for tt in range(T - 1):
                    for ii in range(K):
                        for jj in range(K):
                            list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
                                    list_cur_log_backward_prob[index_X][tt + 1, jj] + list_cur_mat_log_b[index_X][tt + 1, jj] - list_tmp_array1d_log_gamma_sum[index_X][tt]            
        
            self._update_coef(list_tmp_array3d_log_epsilon, list_exog) # update coef and intercept for transmat
            list_cur_list_trans_mat = self._est_trans_mat(list_exog)
            self.list_cur_list_trans_mat = list_cur_list_trans_mat
            
            loglik = np.sum([logsumexp(i[-1]) for i in list_cur_log_forward_prob])
            self.list_loglik_.append(loglik)
            
            self.is_initialized = True
            
            if self.is_print_loglik:
                print("epoch % s, loglik = % s" % (epoch, loglik))
            
            if (len(self.list_loglik_) >= 2) and (abs(self.list_loglik_[-1] - self.list_loglik_[-2]) < self.tol_loglik):
                break
        
        self.list_last_posterior = [i[-1] + j[-1] for (i, j) in zip(list_cur_log_backward_prob, list_cur_log_forward_prob)]
        self.list_log_pred_posterior = []

        list_transmat_additional = self._est_trans_mat(list_exog_additional) if list_exog_additional is not None else None
        
        for ii in range(n_seq):
            tmp = self.list_last_posterior[ii]
            tmp -= logsumexp(tmp)
            self.list_last_posterior[ii] = np.exp(tmp)
            
            tmp_transmat = list_cur_list_trans_mat[ii][-1]
            if list_transmat_additional is not None:
                for tmp_transmat_additional in list_transmat_additional[ii]:
                    tmp_transmat = tmp_transmat @ tmp_transmat_additional
            self.list_log_pred_posterior.append(matmul_log(np.transpose(tmp_transmat), tmp))
            
        if list_X_test is None:
            pred = None
        else:
            pred = []
            for ii in range(n_seq):
                tmp_pred_prob = np.exp(self.list_log_pred_posterior[ii])
                tmp_pred = 0
                for tmp_k in range(K):
                    tmp_pred += list_lr_cov[tmp_k][0].predict([list_X_test[ii]])[0] * tmp_pred_prob[tmp_k]
                pred.append(tmp_pred)

            if not is_multiple_sequence:
                pred = pred[0]
        
        return pred
    
    
    def fit(self, X, Y, exog, is_multiple_sequence = False):
        self.fit_predict(X = X, Y = Y, exog = exog, X_test = None, is_multiple_sequence = is_multiple_sequence)
        return
    
    
    def _calc_emission_mat(self, list_X, list_Y):
        
        list_T = [len(i) for i in list_X]
        p, K = self.n_features, self.n_components
        
        list_cur_mat_log_b = [np.zeros(shape = (i, K)) for i in list_T]
        for ii, (tmp_lr, tmp_cov) in enumerate(self.list_lr_cov):
            for index_X, (X, Y) in enumerate(zip(list_X, list_Y)):
                tmp_resid = Y - tmp_lr.predict(X)
                if self.covariance_type == "full":
                    list_cur_mat_log_b[index_X][:, ii] = scipy.stats.multivariate_normal.logpdf(tmp_resid, mean=[0] * p, cov=tmp_cov)
                elif self.covariance_type == "diag":
                    tmp_logprob = 0
                    for jj in range(p):
                        tmp_logprob += scipy.stats.norm.logpdf(tmp_resid[:, jj], loc = 0, scale = np.sqrt(tmp_cov[jj]))
                    list_cur_mat_log_b[index_X][:, ii] = tmp_logprob

        return list_cur_mat_log_b
    
    
    def online_predict(self, X_test, Y_test, exog_test, is_multiple_sequence = False):
        
        if not self.is_fitted:
            print("Error: Not fitted. MSLRX's online_predict method must be used AFTER fit")
            return
            
        #matmul_log = lambda A, log_b: np.array([logsumexp(np.log(tmp_line) + log_b) for tmp_line in A])
        
        if is_multiple_sequence:
            list_X_test, list_Y_test = [np.array(i) for i in X_test], [np.array(i) for i in Y_test]  
            list_exog_test = [np.array(i) for i in exog_test] 
        else:
            list_X_test, list_Y_test = [np.array(X_test)], [np.array(Y_test)] 
            list_exog_test = [np.array(exog_test)] 

        list_T = [len(i) for i in list_Y_test]
        K = self.n_components
        
        list_cur_mat_log_b = self._calc_emission_mat(list_X_test, list_Y_test)
        list_cur_list_trans_mat = self._est_trans_mat(list_exog_test)

        list_pred = []
        for ii, (cur_mat_log_b, cur_list_trans_mat) in enumerate(zip(list_cur_mat_log_b, list_cur_list_trans_mat)):
            pred = []
            log_last_posterior = self.list_log_pred_posterior[ii]
            for tt in range(list_T[ii]):
                
                log_last_posterior = log_last_posterior + cur_mat_log_b[tt, :]         
                log_last_posterior = matmul_log(np.transpose(cur_list_trans_mat[tt]), log_last_posterior)                    
                log_last_posterior -= logsumexp(log_last_posterior)
                tmp_posterior = np.exp(log_last_posterior)
                
                if tt + 1 < len(list_X_test[ii]):
                    tmp_mean_pred = 0
                    for tmp_k in range(K):
                        tmp_mean_pred += self.list_lr_cov[tmp_k][0].predict([list_X_test[ii][tt + 1]])[0] * tmp_posterior[tmp_k]
                    pred.append(tmp_mean_pred)
                
            self.list_log_pred_posterior[ii] = log_last_posterior
            pred = np.array(pred)
            list_pred.append(pred)
            
        if not is_multiple_sequence:
            list_pred = list_pred[0]
        list_pred = np.array(list_pred)

        res = list_pred
            
        return res
    
    
# def _test_MSLRXSoluIII():
    
#     start_time = datetime.datetime.now()
#     print("start running _test_MSLRXSoluIII()")
    
#     np.random.seed(434)
    
#     list_beta = [np.array([[0.5, 0.5], [0.5, 0.5], [0.5, 0.5]]), 
#                  np.array([[-0.5, 0.7], [-0.5, -0.8], [-0.5, -0.2]]),]
#     list_intercept = [np.array([1., 2.]), np.array([8., 2.])]
#     list_cov = [np.array([[1., 0.1], [0.1, 1.]]), np.array([[3., 0.1], [0.1, 3.]])]
#     transmat = np.array([[0.9, 0.1], [ 0.3, 0.7]])
#     init_prob = np.array([1 / 3, 2 / 3])
#     n_components = len(list_beta)
#     K = n_components
#     p_X = len(list_beta[0])
#     p_exog = 5
    
#     list_coef_transmat = [np.random.uniform(size = (p_exog, K), low = -1., high = 1.) for i in range(K)]
#     list_intercept_transmat = [np.random.uniform(size = K) for i in range(K)]
#     for line in list_coef_transmat:
#         line[0, :] = 0
    
#     T_train = 1000
#     T_test = 50
#     T = T_train + T_test
    
#     list_X, list_Y, list_exog = [], [], []
#     for ii in range(100):
        
#         X = np.random.normal(loc=1.0, scale=5.0, size=(T, p_X))
#         exog = np.random.normal(loc=0.0, scale=1.0, size = (T, p_exog))
#         Y, list_h_label = [], []
#         for tt in range(T):
            
#             index = tt
            
#             if index != 0:
#                 trans_mat = []
#                 for ii in range(K):
#                     tmp_log_prob = exog[index - 1] @ list_coef_transmat[ii] + list_intercept_transmat[ii]
#                     tmp_log_prob -= np.max(tmp_log_prob)
#                     tmp_prob = np.exp(tmp_log_prob)
#                     tmp_prob /= np.sum(tmp_prob)
#                     trans_mat.append(tmp_prob)
#                 trans_mat = np.array(trans_mat)
#                 prob_h_tmp = (h_label_prev @ trans_mat)[0]
#             else:
#                 prob_h_tmp = init_prob
            
#             h_label_tmp_vec = np.random.multinomial(n = 1, pvals = prob_h_tmp, size = 1)
#             h_label_tmp = np.argmax(h_label_tmp_vec)
#             tmp_h_label = h_label_tmp
#             h_label_prev = h_label_tmp_vec
            
#             list_h_label.append(tmp_h_label)
            
#             tmp_Y = X[tt, :] @ list_beta[tmp_h_label] + list_intercept[tmp_h_label] + np.random.multivariate_normal(mean = [0, 0], cov = list_cov[tmp_h_label])
#             Y.append(tmp_Y)
            
#         Y = np.array(Y)
        
#         list_X.append(X)
#         list_Y.append(Y)
#         list_exog.append(exog)
        
#     print("data prepared", str(datetime.datetime.now() - start_time))
        
#     mslrx = MSLRXSoluIII(n_components=2, covariance_type="full", n_iter=10)
#     pred = mslrx.fit_predict(X = [X[:T_train] for X in list_X],
#                              Y = [Y[:T_train] for Y in list_Y],
#                              exog = [exog[:T_train] for exog in list_exog],
#                              X_test = [X[T_train, :] for X in list_X], 
#                              is_multiple_sequence=True,
#                             exog_additional=[exog[:1] for X in list_X])
#     print(pred)
#     ol_pred = mslrx.online_predict(X_test = [X[T_train:] for X in list_X],
#                                    Y_test = [Y[T_train:] for Y in list_Y], 
#                                    exog_test = [exog[T_train:] for exog in list_exog],
#                                    is_multiple_sequence = True)
    
#     list_pred = [[] for i in range(len(list_X))]
#     for cc, (tmp_pred, tmp_pred_ol) in enumerate(zip(pred, ol_pred)):
#         list_pred[cc].append(tmp_pred)
#         list_pred[cc].extend(tmp_pred_ol)

#     print("r2 =", r2_score(y_true = np.array([Y[T_train] for Y in list_Y]).flatten(), y_pred=np.array(pred).flatten()))
#     print("r2 =", r2_score(y_true = np.array([Y[T_train:] for Y in list_Y]).flatten(), y_pred=np.array(list_pred).flatten()))
#     print("running time =", str(datetime.datetime.now() - start_time))
    
#     return


# _test_MSLRXSoluIII()


In [11]:
class SMap:
    
    def __init__(self, theta = 0., reg_method = "OLS", kargs_reg = None):
        
        self.theta = theta
        
        if kargs_reg is None:
            kargs_reg = dict()
        self.kargs_reg = kargs_reg
            
        if reg_method in {"OLS", "LR", "LinearRegression", }:
            self.reg_method = LinearRegression
        elif reg_method in {"Ridge", }:
            self.reg_method = Ridge
        elif reg_method in {"RidgeCV", }:
            self.reg_method = RidgeCV
        elif reg_method in {"LinearSVR", "LinearSVM", }:
            self.reg_method = LinearSVR
        else:
            self.reg_method = reg_method
            
        return
    
    
    def fit(self, X, y, sample_weight = None):
        self.X, self.y = np.array(X), np.array(y)
        if sample_weight is None:
            self.sample_weight = np.array([1. for i in range(len(self.X))])
        else:
            self.sample_weight = np.array(sample_weight)
        return
    
    
    def predict(self, X):
        
        X_train, y_train = self.X, self.y
        sample_weight_train = self.sample_weight
        theta = self.theta
        reg_method, kargs_reg = self.reg_method, self.kargs_reg
        
        y_pred = []
        for tmp_X_test in X:
            tmp_X_test = np.array(tmp_X_test)
            tmp_d_vec = np.sqrt(np.sum((X_train - tmp_X_test) ** 2, axis = 1))
            tmp_sample_weight = np.exp(- theta * tmp_d_vec / tmp_d_vec.mean())
            tmp_sample_weight *= sample_weight_train
            tmp_reg = reg_method(**kargs_reg)
            tmp_reg.fit(X_train, y_train, sample_weight = tmp_sample_weight)
            tmp_pred = tmp_reg.predict([tmp_X_test])[0]
            y_pred.append(tmp_pred)
            
        y_pred = np.array(y_pred)
        return y_pred
            
        
# def _test_SMap():
    
#     N, p, n_test = 10000, 5, 1000
#     beta = np.random.uniform(size = p)
#     X = np.random.normal(size = (N, p))
#     y = X @ beta + 2 + np.random.normal(size = N)
    
#     reg = SMap(theta = 0.5)
#     reg.fit(X[:-n_test], y[:-n_test])
#     y_pred = reg.predict(X[-n_test:])
#     print("r2 =", r2_score(y_true=y[-n_test:], y_pred = y_pred))
    
#     return
    
    
# _test_SMap()        
        

In [12]:
class SMapCV:
    
    def __init__(self, thetas = (0.0, 0.5, 1.0, 1.5, 2.0, ), reg_method = "OLS", kargs_reg = None):
        
        self.thetas = thetas
        
        if kargs_reg is None:
            kargs_reg = dict()
        self.kargs_reg = kargs_reg
            
        if reg_method in {"OLS", "LR", "LinearRegression", }:
            self.reg_method = LinearRegression
        elif reg_method in {"Ridge", }:
            self.reg_method = Ridge
        elif reg_method in {"RidgeCV", }:
            self.reg_method = RidgeCV
        elif reg_method in {"LinearSVR", "LinearSVM", }:
            self.reg_method = LinearSVR
        else:
            self.reg_method = reg_method
            
        return
    
    
    def fit(self, X, y, sample_weight = None):
        
        X, y = np.array(X), np.array(y)
        self.X, self.y = X, y
        if sample_weight is None:
            sample_weight = np.array([1. for i in range(len(self.X))])
        else:
            sample_weight = np.array(sample_weight)
        self.sample_weight = sample_weight
        
        dict_theta_pred = {i: [] for i in self.thetas}
        y_true = []
        sample_weight_true = []
        
        kf = KFold(n_splits=10, shuffle = True, random_state = 434)
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            sample_weight_train, sample_weight_test = sample_weight[train_index], sample_weight[test_index]
            
            y_true.extend(y_test)
            sample_weight_true.extend(sample_weight_test)
            
            for theta in self.thetas:
                tmp_smap = SMap(theta = theta, reg_method = self.reg_method, kargs_reg = self.kargs_reg)
                tmp_smap.fit(X_train, y_train, sample_weight = sample_weight_train)
                y_pred = tmp_smap.predict(X_test)
                dict_theta_pred[theta].extend(y_pred)
                
        list_theta_r2 = []
        for theta in self.thetas:
            tmp_r2 = r2_score(y_true = y_true, y_pred = dict_theta_pred[theta], sample_weight = sample_weight_true)
            list_theta_r2.append((theta, tmp_r2))
        list_theta_r2.sort(key = lambda x: x[1], reverse = True)        
        theta_opt = list_theta_r2[0][0]
        
        self.theta = theta_opt
        
        return
    
    
    def predict(self, X):
        
        X_train, y_train = self.X, self.y
        theta = self.theta
        reg_method, kargs_reg = self.reg_method, self.kargs_reg
        
        y_pred = []
        for tmp_X_test in X:
            tmp_X_test = np.array(tmp_X_test)
            tmp_d_vec = np.sqrt(np.sum((X_train - tmp_X_test) ** 2, axis = 1))
            tmp_sample_weight = np.exp(- theta * tmp_d_vec / tmp_d_vec.mean())
            tmp_reg = reg_method(**kargs_reg)
            tmp_reg.fit(X_train, y_train, sample_weight = tmp_sample_weight)
            tmp_pred = tmp_reg.predict([tmp_X_test])[0]
            y_pred.append(tmp_pred)
            
        y_pred = np.array(y_pred)
        return y_pred
            
        
# def _test_SMapCV():
    
#     N, p, n_test = 1000, 5, 100
#     beta = np.random.uniform(size = p)
#     X = np.random.normal(size = (N, p))
#     y = X @ beta + 2 + np.random.normal(size = N)
    
#     reg = SMapCV()
#     reg.fit(X[:-n_test], y[:-n_test])
#     y_pred = reg.predict(X[-n_test:])
#     print("theta = ", reg.theta)
#     print("r2 =", r2_score(y_true=y[-n_test:], y_pred = y_pred))
    
#     return
    
    
# _test_SMapCV()        
        

## Predictions (WW)

In [13]:
# data_WW_byday = data_WW[["date", "conductivity", "turbidity"]].resample("1D").max()
# data_WW_byday["temperature"] = data_WW["temperature"].resample("1D").mean()
# data_WW_byday["pH"] = data_WW["pH"].resample("1D").mean()
# data_WW_byday["ODO"] = data_WW["ODO"].resample("1D").min()
# data_WW_byday["salinity_max"] = data_WW["salinity"].resample("1D").max()
# data_WW_byday["salinity_min"] = data_WW["salinity"].resample("1D").min()
# data_WW_byday["log10_chlorophyll"] = data_WW["log10_chlorophyll"].resample("1D").max()
# data_WW_byday["chlorophyll"] = data_WW["chlorophyll"].resample("1D").max()

data_WW_byday = data_WW[["temperature", "pH"]].resample("1D").mean()
data_WW_byday["date"] = data_WW["date"].resample("1D").max()
data_WW_byday["conductivity"] = data_WW["conductivity"].resample("1D").apply(lambda x: x.nlargest(2).iloc[-1] if len(x) > 0 else np.nan)
data_WW_byday["turbidity"] = data_WW["turbidity"].resample("1D").apply(lambda x: x.nlargest(2).iloc[-1] if len(x) > 0 else np.nan)
data_WW_byday["ODO"] = data_WW["ODO"].resample("1D").apply(lambda x: x.nsmallest(2).iloc[-1] if len(x) > 0 else np.nan)
data_WW_byday["salinity_max"] = data_WW["salinity"].resample("1D").apply(lambda x: x.nlargest(2).iloc[-1] if len(x) > 0 else np.nan)
data_WW_byday["salinity_min"] = data_WW["salinity"].resample("1D").apply(lambda x: x.nsmallest(2).iloc[-1] if len(x) > 0 else np.nan)
data_WW_byday["log10_chlorophyll"] = data_WW["log10_chlorophyll"].resample("1D").apply(lambda x: x.nlargest(2).iloc[-1] if len(x) > 0 else np.nan)
data_WW_byday["chlorophyll"] = data_WW["chlorophyll"].resample("1D").apply(lambda x: x.nlargest(2).iloc[-1] if len(x) > 0 else np.nan)


In [14]:
data_W_byday = data_W[["date"]].resample("1D").max()
#data_W_byday["airpressure"] = data_W["airpressure"].resample("1D").mean()
data_W_byday["wind"] = data_W["wind"].resample("1D").mean()

data_WW_byday = data_WW_byday.merge(data_W_byday, left_index=True, right_index=True).rename(columns = {"date_x": "date"}).drop(columns = ["date_y"])


In [15]:
data_WW_byday = data_WW_byday.dropna().resample("1D").max()

In [16]:
data_WW_byday

Unnamed: 0_level_0,temperature,pH,date,conductivity,turbidity,ODO,salinity_max,salinity_min,log10_chlorophyll,chlorophyll,wind
time_min,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2018-10-12,24.194146,7.902439,2018-10-12,46.755,44.39,6.32,30.38,28.48,1.285107,19.28,2.774167
2018-10-13,22.100917,7.863437,2018-10-13,46.714,26.13,5.89,30.38,28.42,1.426511,26.70,2.256528
2018-10-14,20.642937,7.871354,2018-10-14,46.585,16.85,6.08,30.31,28.71,1.306211,20.24,1.743472
2018-10-15,21.330021,7.870104,2018-10-15,46.370,15.62,6.33,30.15,28.73,1.272538,18.73,4.257870
2018-10-16,21.417677,7.843646,2018-10-16,46.481,15.39,5.69,30.23,28.61,1.211921,16.29,2.117500
...,...,...,...,...,...,...,...,...,...,...,...
2022-12-20,5.802531,7.942708,2022-12-20,49.090,5.30,9.53,31.54,28.73,0.413300,2.59,1.510417
2022-12-21,5.474958,7.948021,2022-12-21,49.624,8.61,9.63,31.87,29.18,0.456366,2.86,2.171071
2022-12-22,5.977583,7.944375,2022-12-22,49.733,25.74,9.74,31.99,29.53,0.481443,3.03,4.545590
2022-12-23,7.366091,7.911136,2022-12-23,48.814,154.58,9.13,31.47,28.91,0.747412,5.59,5.560694


In [17]:
def _main(data_WW_byday, is_log = True, is_cross = True, is_phase = True, horizon_forecast = 1, thres_quantile = 0.95, n_seq_warmup = 6,
          reg_method = "LinearSVR", reg_method_func = LinearSVR, kargs_reg = {"random_state": 434},
          #reg_method = "LinearRegression", reg_method_func = LinearRegression, kargs_reg = {},
         ):
    
    # Data Preparation
    if is_log:
        array_obs = data_WW_byday["log10_chlorophyll"].to_numpy()
    else:
        array_obs = data_WW_byday["chlorophyll"].to_numpy()

    array_datetime = data_WW_byday["date"].to_numpy()
    #array_cross = data_WW_byday[["salinity_max", "salinity_min", "ODO", "turbidity", "pH", "temperature"]].to_numpy()
    array_cross = data_WW_byday[["salinity_max", "salinity_min", "ODO", "turbidity", "pH", "temperature", "wind"]].to_numpy()
    
    list_split_index = list(np.where(np.isnan(array_obs))[0])
    list_split_index.append(len(array_obs))
    list_split_index.insert(0, 0)
    list_subseq, list_subseq_cross,list_subseq_datetime = [], [], []
    for tmp_index in range(len(list_split_index) - 1):
        tmp_index_left = list_split_index[tmp_index] + 1
        tmp_index_right = list_split_index[tmp_index + 1]
        if tmp_index_right - tmp_index_left < 21:
            continue
        list_subseq.append(array_obs[tmp_index_left:tmp_index_right])
        list_subseq_datetime.append(array_datetime[tmp_index_left:tmp_index_right])
        list_subseq_cross.append(array_cross[tmp_index_left:tmp_index_right])
    
    list_p_AR = [2, 1, ]
    list_p_AR_cross = [1, ]
    list_p_AR.sort(reverse = True)
    max_p = max(max(list_p_AR), max(list_p_AR_cross) if len(list_p_AR_cross) != 0 else 0)

    list_X, list_Y, list_exog = [], [], []
    list_date = []
    for tmp_subseq, tmp_subseq_datetime, tmp_subseq_cross in zip(list_subseq, list_subseq_datetime, list_subseq_cross):
        X, Y, exog = [], [], []
        tmp_list_date = []
        for tmp_index_right in range(max_p, len(tmp_subseq) - horizon_forecast + 1):

            tmp_index_test = tmp_index_right - 1 + horizon_forecast # tmp_index_right == tmp_index_test if horizon_forecast = 1

            tmp_X = []
            if is_cross:
                for tmp_p in list_p_AR_cross:
                    tmp_X.extend(tmp_subseq_cross[tmp_index_right - tmp_p])
            for tmp_p in list_p_AR:
                tmp_X.append(tmp_subseq[tmp_index_right - tmp_p])
            X.append(tmp_X)
            Y.append([tmp_subseq[tmp_index_test]])

            tmp_date= tmp_subseq_datetime[tmp_index_test]
            tmp_list_date.append(tmp_date)
            
            tmp_t = pd.Timestamp(tmp_subseq_datetime[tmp_index_test]).toordinal() - pd.Timestamp(tmp_subseq_datetime[tmp_index_test]).replace(month = 1, day = 1).toordinal()
            if is_phase:
                tmp_exog = [np.cos(tmp_t / 365 * 2 * np.pi), np.sin(tmp_t / 365 * 2 * np.pi)]
            else:
                tmp_exog = [np.cos(tmp_t / 365 * 2 * np.pi)]
            exog.append(tmp_exog)

        list_X.append(X)
        list_Y.append(Y)
        list_exog.append(exog)
        list_date.append(tmp_list_date)
    
    print("Subseqs:")
    for tmp_index in range(len(list_subseq)):
        print("\t", tmp_index, pd.Timestamp(list_subseq_datetime[tmp_index][1]).date(), pd.Timestamp(list_subseq_datetime[tmp_index][-1]).date(), len(list_subseq[tmp_index]))
    print()
    
    # Prediction
    
    start_time = datetime.datetime.now()

    list_y_true, list_y_pred_naive = [], []
    list_y_pred_mslrx, list_y_pred_mslr = [], []
    list_y_pred_mslrxsolu3 = []
    list_y_date = []
    for cc in range(n_seq_warmup, len(list_X)):
        tmp_list_y_true, tmp_list_y_pred_naive = [], []
        tmp_list_y_pred_mslrx, tmp_list_y_pred_mslr = [], []
        tmp_list_y_pred_mslrxsolu3 = []
        tmp_list_y_date = []
        for tmp_index_test in range(5, len(list_X[cc])):
            tmp_list_X, tmp_list_Y, tmp_list_exog = [], [], []
            tmp_list_X_test = []
            for ii in range(cc):
                tmp_list_X.append(copy.deepcopy(list_X[ii]))
                tmp_list_Y.append(copy.deepcopy(list_Y[ii]))
                tmp_list_exog.append(copy.deepcopy(list_exog[ii]))
                tmp_list_X_test.append(copy.deepcopy(list_X[ii][-1]))
            tmp_list_X.append(copy.deepcopy(list_X[cc][:tmp_index_test]))
            tmp_list_Y.append(copy.deepcopy(list_Y[cc][:tmp_index_test]))
            tmp_list_exog.append(copy.deepcopy(list_exog[cc][:tmp_index_test]))
            tmp_list_X_test.append(copy.deepcopy(list_X[cc][tmp_index_test]))

            mslrx = MSLRX(n_components = 2, covariance_type="diag", n_iter = 10, is_logistic_regression_CV = False,
                          logistic_regression_C = 1e10, is_logistic_regression_standardized = False, reg_method = reg_method, kargs_reg = kargs_reg)
            tmp_pred = mslrx.fit_predict(tmp_list_X, tmp_list_Y, tmp_list_exog, tmp_list_X_test, is_multiple_sequence = True)
            tmp_y_pred = tmp_pred[-1]
            tmp_list_y_pred_mslrx.append(tmp_y_pred)

            mslrx = MSLRXSoluIII(n_components = 2, covariance_type="diag", n_iter = 10, reg_method = reg_method, kargs_reg = kargs_reg)
            tmp_pred = mslrx.fit_predict(tmp_list_X, tmp_list_Y, tmp_list_exog, tmp_list_X_test, is_multiple_sequence = True)
            tmp_y_pred = tmp_pred[-1]
            tmp_list_y_pred_mslrxsolu3.append(tmp_y_pred)

            mslr = MSLR(n_components = 2, covariance_type="diag", n_iter = 10, reg_method = reg_method, kargs_reg = kargs_reg)
            tmp_pred = mslr.fit_predict(tmp_list_X, tmp_list_Y, tmp_list_X_test, is_multiple_sequence = True)
            tmp_y_pred = tmp_pred[-1]
            tmp_list_y_pred_mslr.append(tmp_y_pred)

            tmp_y_true = list_Y[cc][tmp_index_test]
            #tmp_y_pred_naive = list_Y[cc][tmp_index_test - 1]
            tmp_y_pred_naive = list_X[cc][tmp_index_test][-1]
            tmp_list_y_true.append(tmp_y_true)
            tmp_list_y_pred_naive.append(tmp_y_pred_naive)
            tmp_list_y_date.append(list_date[cc][tmp_index_test])

        list_y_true.append(tmp_list_y_true)
        list_y_pred_naive.append(tmp_list_y_pred_naive)
        list_y_pred_mslrx.append(tmp_list_y_pred_mslrx)
        list_y_pred_mslr.append(tmp_list_y_pred_mslr)
        list_y_pred_mslrxsolu3.append(tmp_list_y_pred_mslrxsolu3)
        list_y_date.append(tmp_list_y_date)

        print("seq % s finished" % cc, str(datetime.datetime.now() - start_time))

        #if cc > 5: break
        
    if is_log:
        thres = np.nanquantile(np.power(10, array_obs), thres_quantile)
    else:
        thres = np.nanquantile(array_obs, thres_quantile)

    list_p_AR = [2, 1, ]
    list_p_AR_cross = [1, ]
    list_p_AR.sort(reverse = True)
    
    if is_cross:
        max_p = max(max(list_p_AR), max(list_p_AR_cross) if len(list_p_AR_cross) != 0 else 0)
    else:
        max_p = max(list_p_AR)
    
    dict_warmup_evaluation = dict()
    for n_seq_exclude_valid in range(n_seq_warmup, len(list_subseq)):
        
        print()
        print("n_seq_warmup =", n_seq_exclude_valid)
        
        list_res_logitcos_cross = []
        list_res_pred = []
        for start_index in range(7, 20):

            print(start_index)

            X, Y = [], []
            list_index_test_set = []
            list_date_test = []
            for cc, (tmp_subseq, tmp_subseq_datetime, tmp_subseq_cross) in enumerate(zip(list_subseq, list_subseq_datetime, list_subseq_cross)):
                for tmp_index_right in range(max_p, len(tmp_subseq) - horizon_forecast + 1):
                    tmp_index_test = tmp_index_right - 1 + horizon_forecast # tmp_index_right == tmp_index_test if horizon_forecast = 1
                    tmp_X = []
                    if is_cross:
                        for tmp_p in list_p_AR_cross:
                            tmp_X.extend(tmp_subseq_cross[tmp_index_right - tmp_p])
                    for tmp_p in list_p_AR:
                        tmp_X.append(tmp_subseq[tmp_index_right - tmp_p])
                    X.append(tmp_X)
                    Y.append([tmp_subseq[tmp_index_test]])

                    if cc >= n_seq_exclude_valid and tmp_index_right >= start_index:
                        list_index_test_set.append(len(Y) - 1)
                        list_date_test.append(tmp_subseq_datetime[tmp_index_test])

            y_true, y_pred = [], []
            y_pred_naive = []
            for tmp_index in list_index_test_set:
                lr = reg_method_func(**kargs_reg)
                #lr = SMap(theta = 0.5, reg_method = reg_method, kargs_reg = kargs_reg)
                #lr = SMap(theta = 0.5)
                lr.fit(X[:tmp_index], Y[:tmp_index])
                tmp_y_pred = lr.predict([X[tmp_index]])[0]
                tmp_y_true = Y[tmp_index]
                tmp_y_pred_naive = X[tmp_index][-1]
                y_true.append(tmp_y_true)
                y_pred.append(tmp_y_pred)
                y_pred_naive.append(tmp_y_pred_naive)

            if is_log:
                y_true = np.power(10, y_true)
                y_pred_naive = np.power(10, y_pred_naive)
                y_pred = np.power(10, y_pred)

            tmp_r2_ar, tmp_r2_naive = r2_score(y_true = y_true, y_pred = y_pred), r2_score(y_true = y_true, y_pred = y_pred_naive)
            tmp_rocauc_ar, tmp_rocauc_naive = roc_auc_score(y_true = np.array(y_true) > thres, y_score=y_pred), roc_auc_score(y_true = np.array(y_true) > thres, y_score=y_pred_naive)
            y_pred_ar = y_pred
            print("AR", tmp_r2_ar, tmp_r2_naive, tmp_rocauc_ar, tmp_rocauc_naive)
            
            y_true, y_pred = [], []
            y_pred_naive = []
            for tmp_index in list_index_test_set:
                #lr = reg_method_func(**kargs_reg)
                lr = SMap(theta = 0.5, reg_method = reg_method, kargs_reg = kargs_reg)
                #lr = SMap(theta = 0.5)
                lr.fit(X[:tmp_index], Y[:tmp_index])
                tmp_y_pred = lr.predict([X[tmp_index]])[0]
                tmp_y_true = Y[tmp_index]
                tmp_y_pred_naive = X[tmp_index][-1]
                y_true.append(tmp_y_true)
                y_pred.append(tmp_y_pred)
                y_pred_naive.append(tmp_y_pred_naive)

            if is_log:
                y_true = np.power(10, y_true)
                y_pred_naive = np.power(10, y_pred_naive)
                y_pred = np.power(10, y_pred)

            tmp_r2_smap, tmp_r2_naive = r2_score(y_true = y_true, y_pred = y_pred), r2_score(y_true = y_true, y_pred = y_pred_naive)
            tmp_rocauc_smap, tmp_rocauc_naive = roc_auc_score(y_true = np.array(y_true) > thres, y_score=y_pred), roc_auc_score(y_true = np.array(y_true) > thres, y_score=y_pred_naive)
            y_pred_smap = y_pred
            print("SMap", tmp_r2_smap, tmp_r2_naive, tmp_rocauc_smap, tmp_rocauc_naive)

            y_true, y_pred = [], []
            y_pred_naive = []
            for cc in range(n_seq_exclude_valid - n_seq_warmup, len(list_y_true)):
                for tmp_index_test in range(start_index - 5 - max_p, len(list_y_true[cc])):
                    y_true.append(list_y_true[cc][tmp_index_test])
                    y_pred.append(list_y_pred_mslrx[cc][tmp_index_test])
                    y_pred_naive.append(list_y_pred_naive[cc][tmp_index_test])

            if is_log:
                y_true = np.power(10, y_true)
                y_pred_naive = np.power(10, y_pred_naive)
                y_pred = np.power(10, y_pred)

            tmp_r2_mslrx, tmp_r2_naive = r2_score(y_true = y_true, y_pred = y_pred), r2_score(y_true = y_true, y_pred = y_pred_naive)
            tmp_rocauc_mslrx, tmp_rocauc_naive = roc_auc_score(y_true = np.array(y_true) > thres, y_score=y_pred), roc_auc_score(y_true = np.array(y_true) > thres, y_score=y_pred_naive)
            y_pred_mslrx = y_pred
            print("MSLRX", tmp_r2_mslrx, tmp_r2_naive, tmp_rocauc_mslrx, tmp_rocauc_naive)

            y_true, y_pred = [], []
            y_pred_naive = []
            for cc in range(n_seq_exclude_valid - n_seq_warmup, len(list_y_true)):
                for tmp_index_test in range(start_index - 5 - max_p, len(list_y_true[cc])):
                    y_true.append(list_y_true[cc][tmp_index_test])
                    y_pred.append(list_y_pred_mslrxsolu3[cc][tmp_index_test])
                    y_pred_naive.append(list_y_pred_naive[cc][tmp_index_test])

            if is_log:
                y_true = np.power(10, y_true)
                y_pred_naive = np.power(10, y_pred_naive)
                y_pred = np.power(10, y_pred)

            tmp_r2_mslrxsolu3, tmp_r2_naive = r2_score(y_true = y_true, y_pred = y_pred), r2_score(y_true = y_true, y_pred = y_pred_naive)
            tmp_rocauc_mslrxsolu3, tmp_rocauc_naive = roc_auc_score(y_true = np.array(y_true) > thres, y_score=y_pred), roc_auc_score(y_true = np.array(y_true) > thres, y_score=y_pred_naive)
            y_pred_mslrxsolu3 = y_pred
            print("MXLRXSolu3", tmp_r2_mslrxsolu3, tmp_r2_naive, tmp_rocauc_mslrxsolu3, tmp_rocauc_naive)

            y_true, y_pred = [], []
            y_pred_naive = []
            for cc in range(n_seq_exclude_valid - n_seq_warmup, len(list_y_true)):
                for tmp_index_test in range(start_index - 5 - max_p, len(list_y_true[cc])):
                    y_true.append(list_y_true[cc][tmp_index_test])
                    y_pred.append(list_y_pred_mslr[cc][tmp_index_test])
                    y_pred_naive.append(list_y_pred_naive[cc][tmp_index_test])

            if is_log:
                y_true = np.power(10, y_true)
                y_pred_naive = np.power(10, y_pred_naive)
                y_pred = np.power(10, y_pred)

            tmp_r2_mslr, tmp_r2_naive = r2_score(y_true = y_true, y_pred = y_pred), r2_score(y_true = y_true, y_pred = y_pred_naive)
            tmp_rocauc_mslr, tmp_rocauc_naive = roc_auc_score(y_true = np.array(y_true) > thres, y_score=y_pred), roc_auc_score(y_true = np.array(y_true) > thres, y_score=y_pred_naive)
            y_pred_mslr = y_pred
            print("MSLR", tmp_r2_mslr, tmp_r2_naive, tmp_rocauc_mslr, tmp_rocauc_naive)

            list_res_logitcos_cross.append((tmp_r2_naive, tmp_rocauc_naive,
                                            tmp_r2_ar, tmp_rocauc_ar, 
                                            tmp_r2_smap, tmp_rocauc_smap,
                                            tmp_r2_mslrx, tmp_rocauc_mslrx, 
                                            tmp_r2_mslrxsolu3, tmp_rocauc_mslrxsolu3, 
                                            tmp_r2_mslr, tmp_rocauc_mslr,
                                           ))
            
            list_res_pred.append((list_date_test, y_true, y_pred_naive, y_pred_ar, y_pred_smap, y_pred_mslrx, y_pred_mslrxsolu3, y_pred_mslr))
            
        dict_warmup_evaluation[n_seq_exclude_valid] = (list_res_logitcos_cross, list_res_pred)
                
    return dict_warmup_evaluation


In [18]:

dict_setting_results = dict()
for is_log in [True, False]:
    for is_cross in [True, False]:
        for is_phase in [True]:
            for horizon_forecast in [1, 3, 7]:
                
                tmp_setting = (is_log, is_cross, is_phase, horizon_forecast, "LinearRegression", )
                
                print()
                print("================================================")
                print("setting =", tmp_setting)
                print("================================================")
                
                dict_setting_results[tmp_setting] = _main(data_WW_byday, n_seq_warmup=1, 
                                                          #reg_method = "LinearSVR", reg_method_func = LinearSVR, kargs_reg = {"random_state": 434},
                                                          reg_method = "LinearRegression", reg_method_func = LinearRegression, kargs_reg = {},
                                                          is_log = is_log, is_cross = is_cross, is_phase = is_phase, horizon_forecast = horizon_forecast,)
                 
                



setting = (True, True, True, 1, 'LinearRegression')
Subseqs:
	 0 2018-11-01 2019-01-22 84
	 1 2019-02-02 2019-03-24 52
	 2 2019-03-27 2019-06-24 91
	 3 2019-06-27 2019-07-25 30
	 4 2019-08-02 2019-11-16 108
	 5 2019-11-20 2020-02-12 86
	 6 2020-02-15 2020-03-27 43
	 7 2020-03-31 2020-07-16 109
	 8 2020-09-04 2020-09-25 23
	 9 2021-02-09 2021-03-07 28
	 10 2021-04-01 2021-06-28 90
	 11 2021-07-01 2021-07-21 22
	 12 2021-07-29 2021-08-31 35
	 13 2021-09-03 2021-10-27 56
	 14 2021-10-30 2022-01-01 65
	 15 2022-01-15 2022-03-26 72
	 16 2022-03-29 2022-07-23 118
	 17 2022-07-28 2022-12-09 136

seq 1 finished 0:01:56.279849
seq 2 finished 0:06:18.172052
seq 3 finished 0:07:51.427150
seq 4 finished 0:15:22.131946
seq 5 finished 0:23:22.898824
seq 6 finished 0:27:06.761482
seq 7 finished 0:39:06.514548
seq 8 finished 0:41:09.387574
seq 9 finished 0:43:56.282638
seq 10 finished 0:55:55.980854
seq 11 finished 0:58:13.892854
seq 12 finished 1:02:40.704355
seq 13 finished 1:10:51.684142
seq 14 fi

  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 1 finished 0:07:44.587000
seq 2 finished 0:19:24.542000
seq 3 finished 0:21:13.524000
seq 4 finished 0:32:00.244000


  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 5 finished 0:39:38.361000
seq 6 finished 0:43:08.366000
seq 7 finished 0:55:51.989000
seq 8 finished 0:57:19.913000
seq 9 finished 0:59:26.295000
seq 10 finished 1:12:30.209000
seq 11 finished 1:13:58.723000
seq 12 finished 1:17:46.274000
seq 13 finished 1:25:19.964000
seq 14 finished 1:37:03.121000
seq 15 finished 1:49:55.629000
seq 16 finished 2:13:22.447000
seq 17 finished 2:46:03.737000

n_seq_warmup = 1
7
AR 0.2957527642048221 -0.06178691026551264 0.902234109502832 0.8908904971680303
SMap 0.3051550378305814 -0.06178691026551264 0.9030522341095029 0.8908904971680303
MSLRX 0.4392045838782115 -0.06178691026551264 0.9280679672750157 0.8908904971680303
MXLRXSolu3 0.4367050285582871 -0.06178691026551264 0.9328508495909377 0.8908904971680303
MSLR 0.431953163413745 -0.06178691026551264 0.9296727501573316 0.8908904971680303
8
AR 0.290924217022421 -0.06583473014510366 0.9000593510946979 0.8889969664996044
SMap 0.30006072446337906 -0.06583473014510366 0.9009166446847797 0.888996966499604

  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 1 finished 0:02:03.258000
seq 2 finished 0:07:08.234001
seq 3 finished 0:08:41.296999
seq 4 finished 0:16:48.207173
seq 5 finished 0:24:35.600686
seq 6 finished 0:28:28.936686
seq 7 finished 0:41:04.166686
seq 8 finished 0:43:06.308686
seq 9 finished 0:45:54.196685
seq 10 finished 0:57:42.655687
seq 11 finished 1:00:00.655247
seq 12 finished 1:04:17.747335
seq 13 finished 1:11:53.366337
seq 14 finished 1:21:37.222336
seq 15 finished 1:34:03.534336
seq 16 finished 1:57:26.889248
seq 17 finished 2:27:32.762478

n_seq_warmup = 1
7
AR 0.6428416356015036 0.633636127426185 0.9691401984522623 0.9697217870198926
SMap 0.6439414239451153 0.633636127426185 0.9701134691164601 0.9697217870198926
MSLRX 0.630653471985925 0.633636127426185 0.9708256183829463 0.9697217870198926
MXLRXSolu3 0.6291410338079877 0.633636127426185 0.9696149646299197 0.9697217870198926
MSLR 0.635281559318485 0.633636127426185 0.9693063666144425 0.9697217870198926
8
AR 0.6452967318998177 0.6333535384376274 0.96940789473684

  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 1 finished 0:01:32.935546
seq 2 finished 0:05:39.561769
seq 3 finished 0:06:42.531388


  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 4 finished 0:14:48.222488
seq 5 finished 0:23:01.537336
seq 6 finished 0:26:15.867567
seq 7 finished 0:36:56.145639
seq 8 finished 0:38:05.628639
seq 9 finished 0:39:54.521639
seq 10 finished 0:50:33.458748
seq 11 finished 0:51:49.299593
seq 12 finished 0:55:01.787421
seq 13 finished 1:01:54.446765
seq 14 finished 1:10:58.569509
seq 15 finished 1:22:18.287516
seq 16 finished 1:44:22.260978
seq 17 finished 2:11:34.277605

n_seq_warmup = 1
7
AR 0.28384796316012517 -0.06178691026551242 0.9139710509754563 0.8908904971680303
SMap 0.31678312108129447 -0.06178691026551242 0.9180616740088106 0.8908904971680303
MSLRX -239.41570353998503 -0.06178691026551242 0.9004405286343613 0.8908904971680303
MXLRXSolu3 -11.122546658983829 -0.06178691026551242 0.9348646947765891 0.8908904971680303
MSLR -67.78187930864418 -0.06178691026551242 0.9538703587161738 0.8908904971680303
8
AR 0.2779523685975741 -0.06583473014510388 0.9122592983381693 0.8889969664996044
SMap 0.3106633810964202 -0.06583473014510388 

  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 1 finished 0:02:02.119198
seq 2 finished 0:06:54.085254
seq 3 finished 0:08:25.195493
seq 4 finished 0:16:22.309335
seq 5 finished 0:23:58.041490
seq 6 finished 0:27:44.494418
seq 7 finished 0:40:31.232603
seq 8 finished 0:42:37.456289
seq 9 finished 0:45:26.351246
seq 10 finished 0:57:33.971172
seq 11 finished 0:59:51.658663
seq 12 finished 1:04:10.080142
seq 13 finished 1:10:43.990845
seq 14 finished 1:19:06.044528
seq 15 finished 1:29:34.996013
seq 16 finished 1:49:06.916080
seq 17 finished 2:13:41.434034

n_seq_warmup = 1
7
AR 0.6421149192929467 0.633636127426185 0.9683093576413616 0.9697217870198926
SMap 0.6486856735538533 0.633636127426185 0.969187675070028 0.9697217870198926
MSLRX 0.6461483168971942 0.633636127426185 0.9694962730855053 0.9697217870198926
MXLRXSolu3 0.6502886103383956 0.633636127426185 0.9692588899966766 0.9697217870198926
MSLR 0.6472635750831387 0.633636127426185 0.9687603855101363 0.9697217870198926
8
AR 0.6445582355925077 0.6333535384376274 0.9684463562753

  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 1 finished 0:01:40.206432
seq 2 finished 0:05:34.872838
seq 3 finished 0:06:46.085303
seq 4 finished 0:13:24.458293
seq 5 finished 0:19:42.647339
seq 6 finished 0:22:51.254167
seq 7 finished 0:33:14.098725
seq 8 finished 0:34:48.715586
seq 9 finished 0:36:58.751178
seq 10 finished 0:46:59.795171
seq 11 finished 0:48:41.938709
seq 12 finished 0:52:05.205034
seq 13 finished 0:58:23.868046
seq 14 finished 1:06:21.607900
seq 15 finished 1:16:27.258098
seq 16 finished 1:36:13.276958
seq 17 finished 2:00:34.670111

n_seq_warmup = 1
7
AR 0.46848534164461264 0.371146455940506 0.9304896064155324 0.9295135591431888
SMap 0.4814083731392609 0.371146455940506 0.9314920333438851 0.9295135591431888
MSLRX 0.5145116074777296 0.371146455940506 0.9412525060673209 0.9295135591431888
MXLRXSolu3 0.2998338982854828 0.371146455940506 0.8692096655059618 0.9295135591431888
MSLR 0.513812096455945 0.371146455940506 0.9390629946185501 0.9295135591431888
8
AR 0.47172362777121024 0.37320872374631264 0.9318566250

  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 1 finished 0:01:26.835964
seq 2 finished 0:05:03.089697
seq 3 finished 0:05:58.448615


  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 4 finished 0:12:02.602527
seq 5 finished 0:17:40.530841
seq 6 finished 0:20:12.144820
seq 7 finished 0:29:12.410822
seq 8 finished 0:30:12.842508
seq 9 finished 0:31:49.027557
seq 10 finished 0:40:31.724221
seq 11 finished 0:41:35.385815
seq 12 finished 0:44:16.824731
seq 13 finished 0:49:48.853149
seq 14 finished 0:56:46.433931
seq 15 finished 1:05:14.652562
seq 16 finished 1:17:59.984047
seq 17 finished 1:34:22.305541

n_seq_warmup = 1
7
AR 0.22820996167935048 -0.06178691026551242 0.909062303335431 0.8908904971680303
SMap 0.26956001368896476 -0.06178691026551242 0.9070484581497797 0.8908904971680303
MSLRX -0.5239044355789166 -0.06178691026551242 0.9507551919446193 0.8908904971680303
MXLRXSolu3 0.3654044747032372 -0.06178691026551242 0.9258338577721837 0.8908904971680303
MSLR -0.2966403624342375 -0.06178691026551242 0.9475456261799875 0.8908904971680303
8
AR 0.22381752600988059 -0.06583473014510388 0.9077420205750462 0.8889969664996044
SMap 0.2651618206640258 -0.06583473014510388 

In [19]:

with open(r"res/res_vimsww_wind_pred_0207_2.csv", "w") as fw:
    
    head = ["is_log10", "is_cross", "is_phase", "horizon_forecast", "reg_method", "n_seq_warmup", "len_warmup", "r2_naive", "rocauc_naive", "r2_ar", "rocauc_ar", "r2_smap", 
            "rocauc_smap", "r2_mslrx", "rocauc_mslrx", "tmp_r2_mslrxsolu3", "tmp_rocauc_mslrxsolu3", "tmp_r2_mslr", "tmp_rocauc_mslr", ]
    fw.write(",".join(head) + "\n")

    for tmp_setting in dict_setting_results:

        is_log, is_cross, is_phase, horizon_forecast, reg_method = tmp_setting
        dict_warmup_evaluation = dict_setting_results[tmp_setting]

        for n_seq_warmup in dict_warmup_evaluation:
            
            for cc in range(len(dict_warmup_evaluation[n_seq_warmup][0])):
                
                len_warmup = cc + 7
                tmp_r2_naive, tmp_rocauc_naive, tmp_r2_ar, tmp_rocauc_ar, tmp_r2_smap, tmp_rocauc_smap, tmp_r2_mslrx, tmp_rocauc_mslrx, tmp_r2_mslrxsolu3, tmp_rocauc_mslrxsolu3, tmp_r2_mslr, tmp_rocauc_mslr = dict_warmup_evaluation[n_seq_warmup][0][cc]
            
                line = [is_log, is_cross, is_phase, horizon_forecast, reg_method, n_seq_warmup, len_warmup, tmp_r2_naive, tmp_rocauc_naive, tmp_r2_ar, tmp_rocauc_ar, tmp_r2_smap, tmp_rocauc_smap, tmp_r2_mslrx, tmp_rocauc_mslrx, tmp_r2_mslrxsolu3, tmp_rocauc_mslrxsolu3, tmp_r2_mslr, tmp_rocauc_mslr]
                fw.write(",".join([str(i) for i in line]) + "\n")

    

## Predictions (W)

In [13]:
data_W_byday = data_W[["date"]].resample("1D").max()
data_W_byday["temperature"] = data_W["temperature"].resample("1D").mean()
data_W_byday["airpressure"] = data_W["airpressure"].resample("1D").mean()
data_W_byday["wind"] = data_W["wind"].resample("1D").mean()
data_W_byday["waterlevel"] = data_W["waterlevel"].resample("1D").mean()
data_W_byday["ODO"] = data_W["ODO"].resample("1D").min()
data_W_byday["salinity_max"] = data_W["salinity"].resample("1D").max()
data_W_byday["salinity_min"] = data_W["salinity"].resample("1D").min()
data_W_byday["log10_chlorophyll"] = data_W["log10_chlorophyll"].resample("1D").max()
data_W_byday["chlorophyll"] = data_W["chlorophyll"].resample("1D").max()

# data_W_byday = data_W[["temperature", "pH"]].resample("1D").mean()
# data_W_byday["date"] = data_W["date"].resample("1D").max()
# data_W_byday["conductivity"] = data_W["conductivity"].resample("1D").apply(lambda x: x.nlargest(2).iloc[-1] if len(x) > 0 else np.nan)
# data_W_byday["turbidity"] = data_W["turbidity"].resample("1D").apply(lambda x: x.nlargest(2).iloc[-1] if len(x) > 0 else np.nan)
# data_W_byday["ODO"] = data_W["ODO"].resample("1D").apply(lambda x: x.nsmallest(2).iloc[-1] if len(x) > 0 else np.nan)
# data_W_byday["salinity_max"] = data_W["salinity"].resample("1D").apply(lambda x: x.nlargest(2).iloc[-1] if len(x) > 0 else np.nan)
# data_W_byday["salinity_min"] = data_W["salinity"].resample("1D").apply(lambda x: x.nsmallest(2).iloc[-1] if len(x) > 0 else np.nan)
# data_W_byday["log10_chlorophyll"] = data_W["log10_chlorophyll"].resample("1D").apply(lambda x: x.nlargest(2).iloc[-1] if len(x) > 0 else np.nan)
# data_W_byday["chlorophyll"] = data_W["chlorophyll"].resample("1D").apply(lambda x: x.nlargest(2).iloc[-1] if len(x) > 0 else np.nan)


In [14]:
def _main(data_WW_byday, is_log = True, is_cross = True, is_phase = True, horizon_forecast = 1, thres_quantile = 0.95, n_seq_warmup = 6,
          reg_method = "LinearSVR", reg_method_func = LinearSVR, kargs_reg = {"random_state": 434},
          #reg_method = "LinearRegression", reg_method_func = LinearRegression, kargs_reg = {},
         ):
    
    # Data Preparation
    if is_log:
        array_obs = data_WW_byday["log10_chlorophyll"].to_numpy()
    else:
        array_obs = data_WW_byday["chlorophyll"].to_numpy()

    array_datetime = data_WW_byday["date"].to_numpy()
    #array_cross = data_WW_byday[["salinity_max", "salinity_min", "ODO", "turbidity", "pH", "temperature"]].to_numpy()
    #array_cross = data_WW_byday[["temperature", "airpressure", "wind", "waterlevel", "ODO", "salinity_max", "salinity_min"]].to_numpy()
    array_cross = data_WW_byday[["temperature", "ODO", "salinity_max", "salinity_min"]].to_numpy()
    
    list_split_index = list(np.where(np.isnan(array_obs))[0])
    list_split_index.append(len(array_obs))
    list_split_index.insert(0, 0)
    list_subseq, list_subseq_cross,list_subseq_datetime = [], [], []
    for tmp_index in range(len(list_split_index) - 1):
        tmp_index_left = list_split_index[tmp_index] + 1
        tmp_index_right = list_split_index[tmp_index + 1]
        if tmp_index_right - tmp_index_left < 21:
            continue
        list_subseq.append(array_obs[tmp_index_left:tmp_index_right])
        list_subseq_datetime.append(array_datetime[tmp_index_left:tmp_index_right])
        list_subseq_cross.append(array_cross[tmp_index_left:tmp_index_right])
    
    list_p_AR = [2, 1, ]
    list_p_AR_cross = [1, ]
    list_p_AR.sort(reverse = True)
    max_p = max(max(list_p_AR), max(list_p_AR_cross) if len(list_p_AR_cross) != 0 else 0)

    list_X, list_Y, list_exog = [], [], []
    list_date = []
    for tmp_subseq, tmp_subseq_datetime, tmp_subseq_cross in zip(list_subseq, list_subseq_datetime, list_subseq_cross):
        X, Y, exog = [], [], []
        tmp_list_date = []
        for tmp_index_right in range(max_p, len(tmp_subseq) - horizon_forecast + 1):

            tmp_index_test = tmp_index_right - 1 + horizon_forecast # tmp_index_right == tmp_index_test if horizon_forecast = 1

            tmp_X = []
            if is_cross:
                for tmp_p in list_p_AR_cross:
                    tmp_X.extend(tmp_subseq_cross[tmp_index_right - tmp_p])
            for tmp_p in list_p_AR:
                tmp_X.append(tmp_subseq[tmp_index_right - tmp_p])
            X.append(tmp_X)
            Y.append([tmp_subseq[tmp_index_test]])

            tmp_date= tmp_subseq_datetime[tmp_index_test]
            tmp_list_date.append(tmp_date)
            
            tmp_t = pd.Timestamp(tmp_subseq_datetime[tmp_index_test]).toordinal() - pd.Timestamp(tmp_subseq_datetime[tmp_index_test]).replace(month = 1, day = 1).toordinal()
            if is_phase:
                tmp_exog = [np.cos(tmp_t / 365 * 2 * np.pi), np.sin(tmp_t / 365 * 2 * np.pi)]
            else:
                tmp_exog = [np.cos(tmp_t / 365 * 2 * np.pi)]
            exog.append(tmp_exog)

        list_X.append(X)
        list_Y.append(Y)
        list_exog.append(exog)
        list_date.append(tmp_list_date)
    
    print("Subseqs:")
    for tmp_index in range(len(list_subseq)):
        print("\t", tmp_index, pd.Timestamp(list_subseq_datetime[tmp_index][1]).date(), pd.Timestamp(list_subseq_datetime[tmp_index][-1]).date(), len(list_subseq[tmp_index]))
    print()
    
    # Prediction
    
    start_time = datetime.datetime.now()

    list_y_true, list_y_pred_naive = [], []
    list_y_pred_mslrx, list_y_pred_mslr = [], []
    list_y_pred_mslrxsolu3 = []
    list_y_date = []
    for cc in range(n_seq_warmup, len(list_X)):
        tmp_list_y_true, tmp_list_y_pred_naive = [], []
        tmp_list_y_pred_mslrx, tmp_list_y_pred_mslr = [], []
        tmp_list_y_pred_mslrxsolu3 = []
        tmp_list_y_date = []
        for tmp_index_test in range(5, len(list_X[cc])):
            tmp_list_X, tmp_list_Y, tmp_list_exog = [], [], []
            tmp_list_X_test = []
            for ii in range(cc):
                tmp_list_X.append(copy.deepcopy(list_X[ii]))
                tmp_list_Y.append(copy.deepcopy(list_Y[ii]))
                tmp_list_exog.append(copy.deepcopy(list_exog[ii]))
                tmp_list_X_test.append(copy.deepcopy(list_X[ii][-1]))
            tmp_list_X.append(copy.deepcopy(list_X[cc][:tmp_index_test]))
            tmp_list_Y.append(copy.deepcopy(list_Y[cc][:tmp_index_test]))
            tmp_list_exog.append(copy.deepcopy(list_exog[cc][:tmp_index_test]))
            tmp_list_X_test.append(copy.deepcopy(list_X[cc][tmp_index_test]))

            mslrx = MSLRX(n_components = 2, covariance_type="diag", n_iter = 10, is_logistic_regression_CV = False,
                          logistic_regression_C = 1e10, is_logistic_regression_standardized = False, reg_method = reg_method, kargs_reg = kargs_reg)
            tmp_pred = mslrx.fit_predict(tmp_list_X, tmp_list_Y, tmp_list_exog, tmp_list_X_test, is_multiple_sequence = True)
            tmp_y_pred = tmp_pred[-1]
            tmp_list_y_pred_mslrx.append(tmp_y_pred)

            mslrx = MSLRXSoluIII(n_components = 2, covariance_type="diag", n_iter = 10, reg_method = reg_method, kargs_reg = kargs_reg)
            tmp_pred = mslrx.fit_predict(tmp_list_X, tmp_list_Y, tmp_list_exog, tmp_list_X_test, is_multiple_sequence = True)
            tmp_y_pred = tmp_pred[-1]
            tmp_list_y_pred_mslrxsolu3.append(tmp_y_pred)

            mslr = MSLR(n_components = 2, covariance_type="diag", n_iter = 10, reg_method = reg_method, kargs_reg = kargs_reg)
            tmp_pred = mslr.fit_predict(tmp_list_X, tmp_list_Y, tmp_list_X_test, is_multiple_sequence = True)
            tmp_y_pred = tmp_pred[-1]
            tmp_list_y_pred_mslr.append(tmp_y_pred)

            tmp_y_true = list_Y[cc][tmp_index_test]
            #tmp_y_pred_naive = list_Y[cc][tmp_index_test - 1]
            tmp_y_pred_naive = list_X[cc][tmp_index_test][-1]
            tmp_list_y_true.append(tmp_y_true)
            tmp_list_y_pred_naive.append(tmp_y_pred_naive)
            tmp_list_y_date.append(list_date[cc][tmp_index_test])

        list_y_true.append(tmp_list_y_true)
        list_y_pred_naive.append(tmp_list_y_pred_naive)
        list_y_pred_mslrx.append(tmp_list_y_pred_mslrx)
        list_y_pred_mslr.append(tmp_list_y_pred_mslr)
        list_y_pred_mslrxsolu3.append(tmp_list_y_pred_mslrxsolu3)
        list_y_date.append(tmp_list_y_date)

        print("seq % s finished" % cc, str(datetime.datetime.now() - start_time))

        #if cc > 5: break
        
    if is_log:
        thres = np.nanquantile(np.power(10, array_obs), thres_quantile)
    else:
        thres = np.nanquantile(array_obs, thres_quantile)

    list_p_AR = [2, 1, ]
    list_p_AR_cross = [1, ]
    list_p_AR.sort(reverse = True)
    
    if is_cross:
        max_p = max(max(list_p_AR), max(list_p_AR_cross) if len(list_p_AR_cross) != 0 else 0)
    else:
        max_p = max(list_p_AR)
    
    dict_warmup_evaluation = dict()
    for n_seq_exclude_valid in range(n_seq_warmup, len(list_subseq) - 2):
        
        print()
        print("n_seq_warmup =", n_seq_exclude_valid)
        
        list_res_logitcos_cross = []
        list_res_pred = []
        for start_index in range(7, 20):

            print(start_index)

            X, Y = [], []
            list_index_test_set = []
            list_date_test = []
            for cc, (tmp_subseq, tmp_subseq_datetime, tmp_subseq_cross) in enumerate(zip(list_subseq, list_subseq_datetime, list_subseq_cross)):
                for tmp_index_right in range(max_p, len(tmp_subseq) - horizon_forecast + 1):
                    tmp_index_test = tmp_index_right - 1 + horizon_forecast # tmp_index_right == tmp_index_test if horizon_forecast = 1
                    tmp_X = []
                    if is_cross:
                        for tmp_p in list_p_AR_cross:
                            tmp_X.extend(tmp_subseq_cross[tmp_index_right - tmp_p])
                    for tmp_p in list_p_AR:
                        tmp_X.append(tmp_subseq[tmp_index_right - tmp_p])
                    X.append(tmp_X)
                    Y.append([tmp_subseq[tmp_index_test]])

                    if cc >= n_seq_exclude_valid and tmp_index_right >= start_index:
                        list_index_test_set.append(len(Y) - 1)
                        list_date_test.append(tmp_subseq_datetime[tmp_index_test])

            y_true, y_pred = [], []
            y_pred_naive = []
            for tmp_index in list_index_test_set:
                lr = reg_method_func(**kargs_reg)
                #lr = SMap(theta = 0.5, reg_method = reg_method, kargs_reg = kargs_reg)
                #lr = SMap(theta = 0.5)
                lr.fit(X[:tmp_index], Y[:tmp_index])
                tmp_y_pred = lr.predict([X[tmp_index]])[0]
                tmp_y_true = Y[tmp_index]
                tmp_y_pred_naive = X[tmp_index][-1]
                y_true.append(tmp_y_true)
                y_pred.append(tmp_y_pred)
                y_pred_naive.append(tmp_y_pred_naive)

            if is_log:
                y_true = np.power(10, y_true)
                y_pred_naive = np.power(10, y_pred_naive)
                y_pred = np.power(10, y_pred)

            tmp_r2_ar, tmp_r2_naive = r2_score(y_true = y_true, y_pred = y_pred), r2_score(y_true = y_true, y_pred = y_pred_naive)
            tmp_rocauc_ar, tmp_rocauc_naive = roc_auc_score(y_true = np.array(y_true) > thres, y_score=y_pred), roc_auc_score(y_true = np.array(y_true) > thres, y_score=y_pred_naive)
            y_pred_ar = y_pred
            print("AR", tmp_r2_ar, tmp_r2_naive, tmp_rocauc_ar, tmp_rocauc_naive)
            
            y_true, y_pred = [], []
            y_pred_naive = []
            for tmp_index in list_index_test_set:
                #lr = reg_method_func(**kargs_reg)
                lr = SMap(theta = 0.5, reg_method = reg_method, kargs_reg = kargs_reg)
                #lr = SMap(theta = 0.5)
                lr.fit(X[:tmp_index], Y[:tmp_index])
                tmp_y_pred = lr.predict([X[tmp_index]])[0]
                tmp_y_true = Y[tmp_index]
                tmp_y_pred_naive = X[tmp_index][-1]
                y_true.append(tmp_y_true)
                y_pred.append(tmp_y_pred)
                y_pred_naive.append(tmp_y_pred_naive)

            if is_log:
                y_true = np.power(10, y_true)
                y_pred_naive = np.power(10, y_pred_naive)
                y_pred = np.power(10, y_pred)

            tmp_r2_smap, tmp_r2_naive = r2_score(y_true = y_true, y_pred = y_pred), r2_score(y_true = y_true, y_pred = y_pred_naive)
            tmp_rocauc_smap, tmp_rocauc_naive = roc_auc_score(y_true = np.array(y_true) > thres, y_score=y_pred), roc_auc_score(y_true = np.array(y_true) > thres, y_score=y_pred_naive)
            y_pred_smap = y_pred
            print("SMap", tmp_r2_smap, tmp_r2_naive, tmp_rocauc_smap, tmp_rocauc_naive)

            y_true, y_pred = [], []
            y_pred_naive = []
            for cc in range(n_seq_exclude_valid - n_seq_warmup, len(list_y_true)):
                for tmp_index_test in range(start_index - 5 - max_p, len(list_y_true[cc])):
                    y_true.append(list_y_true[cc][tmp_index_test])
                    y_pred.append(list_y_pred_mslrx[cc][tmp_index_test])
                    y_pred_naive.append(list_y_pred_naive[cc][tmp_index_test])

            if is_log:
                y_true = np.power(10, y_true)
                y_pred_naive = np.power(10, y_pred_naive)
                y_pred = np.power(10, y_pred)

            tmp_r2_mslrx, tmp_r2_naive = r2_score(y_true = y_true, y_pred = y_pred), r2_score(y_true = y_true, y_pred = y_pred_naive)
            tmp_rocauc_mslrx, tmp_rocauc_naive = roc_auc_score(y_true = np.array(y_true) > thres, y_score=y_pred), roc_auc_score(y_true = np.array(y_true) > thres, y_score=y_pred_naive)
            y_pred_mslrx = y_pred
            print("MSLRX", tmp_r2_mslrx, tmp_r2_naive, tmp_rocauc_mslrx, tmp_rocauc_naive)

            y_true, y_pred = [], []
            y_pred_naive = []
            for cc in range(n_seq_exclude_valid - n_seq_warmup, len(list_y_true)):
                for tmp_index_test in range(start_index - 5 - max_p, len(list_y_true[cc])):
                    y_true.append(list_y_true[cc][tmp_index_test])
                    y_pred.append(list_y_pred_mslrxsolu3[cc][tmp_index_test])
                    y_pred_naive.append(list_y_pred_naive[cc][tmp_index_test])

            if is_log:
                y_true = np.power(10, y_true)
                y_pred_naive = np.power(10, y_pred_naive)
                y_pred = np.power(10, y_pred)

            tmp_r2_mslrxsolu3, tmp_r2_naive = r2_score(y_true = y_true, y_pred = y_pred), r2_score(y_true = y_true, y_pred = y_pred_naive)
            tmp_rocauc_mslrxsolu3, tmp_rocauc_naive = roc_auc_score(y_true = np.array(y_true) > thres, y_score=y_pred), roc_auc_score(y_true = np.array(y_true) > thres, y_score=y_pred_naive)
            y_pred_mslrxsolu3 = y_pred
            print("MXLRXSolu3", tmp_r2_mslrxsolu3, tmp_r2_naive, tmp_rocauc_mslrxsolu3, tmp_rocauc_naive)

            y_true, y_pred = [], []
            y_pred_naive = []
            for cc in range(n_seq_exclude_valid - n_seq_warmup, len(list_y_true)):
                for tmp_index_test in range(start_index - 5 - max_p, len(list_y_true[cc])):
                    y_true.append(list_y_true[cc][tmp_index_test])
                    y_pred.append(list_y_pred_mslr[cc][tmp_index_test])
                    y_pred_naive.append(list_y_pred_naive[cc][tmp_index_test])

            if is_log:
                y_true = np.power(10, y_true)
                y_pred_naive = np.power(10, y_pred_naive)
                y_pred = np.power(10, y_pred)

            tmp_r2_mslr, tmp_r2_naive = r2_score(y_true = y_true, y_pred = y_pred), r2_score(y_true = y_true, y_pred = y_pred_naive)
            tmp_rocauc_mslr, tmp_rocauc_naive = roc_auc_score(y_true = np.array(y_true) > thres, y_score=y_pred), roc_auc_score(y_true = np.array(y_true) > thres, y_score=y_pred_naive)
            y_pred_mslr = y_pred
            print("MSLR", tmp_r2_mslr, tmp_r2_naive, tmp_rocauc_mslr, tmp_rocauc_naive)

            list_res_logitcos_cross.append((tmp_r2_naive, tmp_rocauc_naive,
                                            tmp_r2_ar, tmp_rocauc_ar, 
                                            tmp_r2_smap, tmp_rocauc_smap,
                                            tmp_r2_mslrx, tmp_rocauc_mslrx, 
                                            tmp_r2_mslrxsolu3, tmp_rocauc_mslrxsolu3, 
                                            tmp_r2_mslr, tmp_rocauc_mslr,
                                           ))
            
            list_res_pred.append((list_date_test, y_true, y_pred_naive, y_pred_ar, y_pred_smap, y_pred_mslrx, y_pred_mslrxsolu3, y_pred_mslr))
            
        dict_warmup_evaluation[n_seq_exclude_valid] = (list_res_logitcos_cross, list_res_pred)
                
    return dict_warmup_evaluation


In [16]:

dict_setting_results = dict()
for is_log in [True]:
    for is_cross in [True, False]:
        for is_phase in [True]:
            for horizon_forecast in [1, 3, 7]:
                
                tmp_setting = (is_log, is_cross, is_phase, horizon_forecast, "LinearRegression", )
                
                print()
                print("================================================")
                print("setting =", tmp_setting)
                print("================================================")
                
                dict_setting_results[tmp_setting] = _main(data_W_byday, n_seq_warmup=1, 
                                                          thres_quantile = 0.9, 
                                                          #reg_method = "LinearSVR", reg_method_func = LinearSVR, kargs_reg = {"random_state": 434},
                                                          reg_method = "LinearRegression", reg_method_func = LinearRegression, kargs_reg = {},
                                                          is_log = is_log, is_cross = is_cross, is_phase = is_phase, horizon_forecast = horizon_forecast,)
                 
                



setting = (True, True, True, 1, 'LinearRegression')
Subseqs:
	 0 2016-04-19 2016-05-10 23
	 1 2016-06-07 2016-06-30 25
	 2 2016-07-19 2016-08-13 27
	 3 2016-08-31 2016-11-23 86
	 4 2017-04-06 2017-05-11 37
	 5 2017-06-16 2017-07-13 29
	 6 2017-07-19 2017-08-08 22
	 7 2017-09-13 2017-10-19 38
	 8 2017-11-01 2017-12-10 41
	 9 2018-01-26 2018-02-22 29
	 10 2018-04-10 2018-06-15 68
	 11 2019-02-05 2019-03-12 37
	 12 2019-03-21 2019-04-17 29
	 13 2019-04-25 2019-05-21 28
	 14 2019-05-24 2019-07-25 64
	 15 2019-08-01 2019-09-02 34
	 16 2019-09-06 2019-09-27 23
	 17 2020-04-15 2020-05-04 21
	 18 2020-12-12 2021-01-27 48
	 19 2021-03-26 2021-08-05 134
	 20 2021-08-13 2021-12-01 112
	 21 2021-12-16 2022-10-23 313
	 22 2022-10-26 2022-12-31 68



  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 1 finished 0:00:21.921997


  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \


seq 2 finished 0:00:51.542996
seq 3 finished 0:03:36.897997
seq 4 finished 0:04:53.447998
seq 5 finished 0:05:56.035000
seq 6 finished 0:06:41.028999
seq 7 finished 0:08:19.446997
seq 8 finished 0:10:19.451998
seq 9 finished 0:11:47.001998
seq 10 finished 0:16:14.126996
seq 11 finished 0:18:29.387995
seq 12 finished 0:20:15.070998
seq 13 finished 0:22:00.795995
seq 14 finished 0:27:13.206997
seq 15 finished 0:29:52.122997
seq 16 finished 0:31:30.794999
seq 17 finished 0:33:00.456998
seq 18 finished 0:37:28.658996
seq 19 finished 0:51:28.981996
seq 20 finished 1:03:13.048998
seq 21 finished 1:47:32.621985
seq 22 finished 1:59:26.156980

n_seq_warmup = 1
7
AR 0.7870959958234819 0.7776165948741907 0.9503428162095784 0.9535577841451767
SMap 0.7900476942441309 0.7776165948741907 0.9513576204120616 0.9535577841451767
MSLRX 0.7806111075241985 0.7776165948741907 0.9481255969436485 0.9535577841451767
MXLRXSolu3 0.784075215616787 0.7776165948741907 0.949788511393096 0.9535577841451767
MSLR 0.787

  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 1 finished 0:00:15.785001


  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 2 finished 0:00:44.250000


  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 3 finished 0:03:29.523001
seq 4 finished 0:04:46.498000
seq 5 finished 0:05:44.750005
seq 6 finished 0:06:25.694001
seq 7 finished 0:08:13.049002
seq 8 finished 0:10:18.735000
seq 9 finished 0:11:42.265001
seq 10 finished 0:16:12.429002
seq 11 finished 0:18:25.859001
seq 12 finished 0:20:08.209001
seq 13 finished 0:21:50.792002
seq 14 finished 0:27:16.656001
seq 15 finished 0:30:00.830000
seq 16 finished 0:31:36.909000
seq 17 finished 0:33:01.358001
seq 18 finished 0:37:41.098001
seq 19 finished 0:53:55.953001
seq 20 finished 1:08:43.345001
seq 21 finished 1:58:49.443001
seq 22 finished 2:09:01.964002

n_seq_warmup = 1
7
AR 0.47682707921054235 0.39639830601669324 0.8737756944571106 0.8779889470516352
SMap 0.4915795880396885 0.39639830601669324 0.8759826362923377 0.8779889470516352
MSLRX 0.5939206083252333 0.39639830601669324 0.904198661243548 0.8779889470516352
MXLRXSolu3 0.5567205532183226 0.39639830601669324 0.8981067722107721 0.8779889470516352
MSLR 0.5921535905024737 0.39639830

  tmp_log_prob.append(logsumexp([np.log(self.transmat_[ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(self.transmat_[ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(self.transmat_[ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(self.transmat_[ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(self.transmat_[ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(self.transmat_[ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(self.transmat_[ii, j]) + list_cur_ma

seq 1 finished 0:00:09.311986
seq 2 finished 0:00:23.949985


  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 3 finished 0:02:46.015986
seq 4 finished 0:03:44.988985
seq 5 finished 0:04:27.099986
seq 6 finished 0:04:51.690985
seq 7 finished 0:06:05.908986
seq 8 finished 0:07:44.661986
seq 9 finished 0:08:45.324986
seq 10 finished 0:12:22.208986
seq 11 finished 0:13:59.312985
seq 12 finished 0:15:07.726986
seq 13 finished 0:16:15.168985
seq 14 finished 0:20:29.866987
seq 15 finished 0:22:26.099985
seq 16 finished 0:23:21.923985
seq 17 finished 0:24:08.829986
seq 18 finished 0:27:38.917987
seq 19 finished 0:41:24.128985
seq 20 finished 0:53:21.492987
seq 21 finished 1:32:14.172017
seq 22 finished 1:39:57.381986

n_seq_warmup = 1
7
AR 0.1919577604469357 -0.10253890850258252 0.7655917049273961 0.758781661211309
SMap 0.22205681157888102 -0.10253890850258252 0.7729070099514509 0.758781661211309
MSLRX -0.11473733318389279 -0.10253890850258252 0.8824828101315877 0.758781661211309
MXLRXSolu3 -0.10316949990558855 -0.10253890850258252 0.8733112189978252 0.758781661211309
MSLR 0.4702747750500931 -0.10

  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 1 finished 0:00:18.384003


  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 2 finished 0:00:41.209002
seq 3 finished 0:02:57.744999
seq 4 finished 0:04:01.178002
seq 5 finished 0:04:54.544003
seq 6 finished 0:05:32.996001
seq 7 finished 0:06:55.224000
seq 8 finished 0:08:37.990999
seq 9 finished 0:09:43.630001
seq 10 finished 0:12:59.216000
seq 11 finished 0:14:46.103000
seq 12 finished 0:16:10.057001
seq 13 finished 0:17:33.501002
seq 14 finished 0:21:40.910000
seq 15 finished 0:23:45.186001
seq 16 finished 0:25:01.388000
seq 17 finished 0:26:10.975001
seq 18 finished 0:29:43.555000
seq 19 finished 0:43:17.018001
seq 20 finished 0:57:41.221115
seq 21 finished 1:46:38.125043
seq 22 finished 1:58:02.471601

n_seq_warmup = 1
7
AR 0.7883652886513541 0.7776165948741907 0.9533531177513985 0.9535577841451767
SMap 0.7896973120309798 0.7776165948741907 0.9533872288170282 0.9535577841451767
MSLRX 0.7889441355078225 0.7776165948741907 0.9528670350661755 0.9535577841451767
MXLRXSolu3 0.7875324069507019 0.7776165948741907 0.9530120070951017 0.9535577841451767
MSLR 0.7

  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 1 finished 0:00:14.705000


  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 2 finished 0:00:39.133025


  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 3 finished 0:03:18.254999
seq 4 finished 0:04:33.955002
seq 5 finished 0:05:28.041000
seq 6 finished 0:06:05.845001
seq 7 finished 0:07:43.262001
seq 8 finished 0:09:39.503000
seq 9 finished 0:10:52.611000
seq 10 finished 0:14:58.456039
seq 11 finished 0:16:57.268038
seq 12 finished 0:18:28.020038
seq 13 finished 0:19:59.563039
seq 14 finished 0:24:55.948038
seq 15 finished 0:27:22.340038
seq 16 finished 0:28:47.708038
seq 17 finished 0:30:02.659038
seq 18 finished 0:34:10.155039
seq 19 finished 0:48:56.316040
seq 20 finished 1:01:42.673038
seq 21 finished 1:47:09.300768
seq 22 finished 1:57:08.921435

n_seq_warmup = 1
7
AR 0.4782670657011978 0.39639830601669324 0.8770222700494282 0.8779889470516352
SMap 0.4914917215354184 0.39639830601669324 0.8777700767869845 0.8779889470516352
MSLRX 0.5926933769238388 0.39639830601669324 0.903852116657851 0.8779889470516352
MXLRXSolu3 0.5220747784090163 0.39639830601669324 0.8818100570886606 0.8779889470516352
MSLR 0.5983024271964781 0.396398306

  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 1 finished 0:00:10.797628
seq 2 finished 0:00:28.597049


  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 3 finished 0:02:52.574716


  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 4 finished 0:03:53.171705
seq 5 finished 0:04:39.043338
seq 6 finished 0:05:04.705394
seq 7 finished 0:06:14.694873
seq 8 finished 0:07:49.776936
seq 9 finished 0:08:40.203736
seq 10 finished 0:12:02.376944
seq 11 finished 0:13:31.771749
seq 12 finished 0:14:35.818195
seq 13 finished 0:15:42.525165
seq 14 finished 0:20:01.339942
seq 15 finished 0:21:57.237001
seq 16 finished 0:23:00.705699
seq 17 finished 0:23:47.741931
seq 18 finished 0:27:24.723570
seq 19 finished 0:43:37.692267
seq 20 finished 0:58:42.348487
seq 21 finished 1:55:00.405837
seq 22 finished 2:06:19.057385

n_seq_warmup = 1
7
AR 0.17695744482022646 -0.10253890850258252 0.7609454976823883 0.758781661211309
SMap 0.20403030600244165 -0.10253890850258252 0.7637573867006436 0.758781661211309
MSLRX 0.4829751898790692 -0.10253890850258252 0.8575382790360493 0.758781661211309
MXLRXSolu3 0.41361360131154334 -0.10253890850258252 0.8363832077502691 0.758781661211309
MSLR 0.4816416072535995 -0.10253890850258252 0.85976801915599

In [17]:

with open(r"res/res_vimsw_pred_0205_2.csv", "w") as fw:
    
    head = ["is_log10", "is_cross", "is_phase", "horizon_forecast", "reg_method", "n_seq_warmup", "len_warmup", "r2_naive", "rocauc_naive", "r2_ar", "rocauc_ar", "r2_smap", 
            "rocauc_smap", "r2_mslrx", "rocauc_mslrx", "tmp_r2_mslrxsolu3", "tmp_rocauc_mslrxsolu3", "tmp_r2_mslr", "tmp_rocauc_mslr", ]
    fw.write(",".join(head) + "\n")

    for tmp_setting in dict_setting_results:

        is_log, is_cross, is_phase, horizon_forecast, reg_method = tmp_setting
        dict_warmup_evaluation = dict_setting_results[tmp_setting]

        for n_seq_warmup in dict_warmup_evaluation:
            
            for cc in range(len(dict_warmup_evaluation[n_seq_warmup][0])):
                
                len_warmup = cc + 7
                tmp_r2_naive, tmp_rocauc_naive, tmp_r2_ar, tmp_rocauc_ar, tmp_r2_smap, tmp_rocauc_smap, tmp_r2_mslrx, tmp_rocauc_mslrx, tmp_r2_mslrxsolu3, tmp_rocauc_mslrxsolu3, tmp_r2_mslr, tmp_rocauc_mslr = dict_warmup_evaluation[n_seq_warmup][0][cc]
            
                line = [is_log, is_cross, is_phase, horizon_forecast, reg_method, n_seq_warmup, len_warmup, tmp_r2_naive, tmp_rocauc_naive, tmp_r2_ar, tmp_rocauc_ar, tmp_r2_smap, tmp_rocauc_smap, tmp_r2_mslrx, tmp_rocauc_mslrx, tmp_r2_mslrxsolu3, tmp_rocauc_mslrxsolu3, tmp_r2_mslr, tmp_rocauc_mslr]
                fw.write(",".join([str(i) for i in line]) + "\n")

    

In [18]:

dict_setting_results = dict()
for is_log in [False]:
    for is_cross in [True, False]:
        for is_phase in [True]:
            for horizon_forecast in [1, 3, 7]:
                
                tmp_setting = (is_log, is_cross, is_phase, horizon_forecast, "LinearRegression", )
                
                print()
                print("================================================")
                print("setting =", tmp_setting)
                print("================================================")
                
                dict_setting_results[tmp_setting] = _main(data_W_byday, n_seq_warmup=1, 
                                                          thres_quantile = 0.9, 
                                                          #reg_method = "LinearSVR", reg_method_func = LinearSVR, kargs_reg = {"random_state": 434},
                                                          reg_method = "LinearRegression", reg_method_func = LinearRegression, kargs_reg = {},
                                                          is_log = is_log, is_cross = is_cross, is_phase = is_phase, horizon_forecast = horizon_forecast,)
                 
                



setting = (False, True, True, 1, 'LinearRegression')
Subseqs:
	 0 2016-04-19 2016-05-10 23
	 1 2016-06-07 2016-06-30 25
	 2 2016-07-19 2016-08-13 27
	 3 2016-08-31 2016-11-23 86
	 4 2017-04-06 2017-05-11 37
	 5 2017-06-16 2017-07-13 29
	 6 2017-07-19 2017-08-08 22
	 7 2017-09-13 2017-10-19 38
	 8 2017-11-01 2017-12-10 41
	 9 2018-01-26 2018-02-22 29
	 10 2018-04-10 2018-06-15 68
	 11 2019-02-05 2019-03-12 37
	 12 2019-03-21 2019-04-17 29
	 13 2019-04-25 2019-05-21 28
	 14 2019-05-24 2019-07-25 64
	 15 2019-08-01 2019-09-02 34
	 16 2019-09-06 2019-09-27 23
	 17 2020-04-15 2020-05-04 21
	 18 2020-12-12 2021-01-27 48
	 19 2021-03-26 2021-08-05 134
	 20 2021-08-13 2021-12-01 112
	 21 2021-12-16 2022-10-23 313
	 22 2022-10-26 2022-12-31 68



  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 1 finished 0:00:19.529547


  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \


seq 2 finished 0:00:51.412739


  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 3 finished 0:04:10.215682
seq 4 finished 0:05:51.155065
seq 5 finished 0:07:12.820295
seq 6 finished 0:08:13.229013
seq 7 finished 0:10:28.111115
seq 8 finished 0:13:08.693076
seq 9 finished 0:14:56.215724
seq 10 finished 0:20:20.913370
seq 11 finished 0:23:15.464419
seq 12 finished 0:25:29.738855
seq 13 finished 0:27:43.007476
seq 14 finished 0:34:20.489015
seq 15 finished 0:37:42.695279
seq 16 finished 0:39:46.339654
seq 17 finished 0:41:36.947128
seq 18 finished 0:47:27.689768
seq 19 finished 1:07:51.185543
seq 20 finished 1:26:41.282549
seq 21 finished 2:27:41.864493
seq 22 finished 2:40:42.380099

n_seq_warmup = 1
7
AR 0.7752481114572535 0.7776165948741907 0.9484411243007231 0.9535577841451767
SMap 0.7822071195610025 0.7776165948741907 0.9500784554509483 0.9535577841451767
MSLRX 0.774979507337267 0.7776165948741907 0.947144903806795 0.9535577841451767
MXLRXSolu3 0.7698244738824872 0.7776165948741907 0.9456781279847183 0.9535577841451767
MSLR 0.7694313113513583 0.77761659487419

  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 1 finished 0:00:17.305817


  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 2 finished 0:00:46.213040


  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 3 finished 0:03:55.557041
seq 4 finished 0:05:26.916580
seq 5 finished 0:06:38.408615
seq 6 finished 0:07:27.952037
seq 7 finished 0:09:30.034401
seq 8 finished 0:11:57.023880
seq 9 finished 0:13:32.362607
seq 10 finished 0:18:38.164070
seq 11 finished 0:21:19.650706
seq 12 finished 0:23:19.241337
seq 13 finished 0:25:17.596511
seq 14 finished 0:31:30.461543
seq 15 finished 0:34:21.800851
seq 16 finished 0:36:00.851122
seq 17 finished 0:37:27.807263
seq 18 finished 0:42:21.223724
seq 19 finished 0:59:24.724940
seq 20 finished 1:16:01.190022
seq 21 finished 2:16:42.959057
seq 22 finished 2:29:16.672406

n_seq_warmup = 1
7
AR 0.46232746582795126 0.39639830601669324 0.8648111332007952 0.8779889470516352
SMap 0.49816470094075593 0.39639830601669324 0.8726904627282178 0.8779889470516352
MSLRX 0.5438157213556774 0.39639830601669324 0.8936381709741551 0.8779889470516352
MXLRXSolu3 0.5015631579057946 0.39639830601669324 0.8785543619019826 0.8779889470516352
MSLR 0.5680271571345867 0.396398

  tmp_log_prob.append(logsumexp([np.log(self.transmat_[ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(self.transmat_[ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(self.transmat_[ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(self.transmat_[ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(self.transmat_[ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(self.transmat_[ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(self.transmat_[ii, j]) + list_cur_ma

seq 1 finished 0:00:10.503425


  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 2 finished 0:00:29.628650


  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 3 finished 0:03:27.355229


  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 4 finished 0:04:41.975770


  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 5 finished 0:05:39.928318


  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 6 finished 0:06:13.915506
seq 7 finished 0:07:56.211450
seq 8 finished 0:09:56.618558
seq 9 finished 0:11:11.819522
seq 10 finished 0:15:54.177132
seq 11 finished 0:18:05.138413
seq 12 finished 0:19:36.456248
seq 13 finished 0:21:05.678651
seq 14 finished 0:26:31.498610
seq 15 finished 0:28:54.878312
seq 16 finished 0:30:08.216670
seq 17 finished 0:31:08.127635
seq 18 finished 0:35:35.351406
seq 19 finished 0:51:03.009308
seq 20 finished 1:04:58.573488
seq 21 finished 1:52:11.897869
seq 22 finished 2:01:41.368575

n_seq_warmup = 1
7
AR 0.22202967584766753 -0.10253890850258252 0.7591111794556359 0.758781661211309
SMap 0.2651266347113628 -0.10253890850258252 0.771534017266756 0.758781661211309
MSLRX 0.5252920261755198 -0.10253890850258252 0.8649744074163572 0.758781661211309
MXLRXSolu3 0.46741495546629963 -0.10253890850258252 0.843270139056699 0.758781661211309
MSLR 0.5280126478583449 -0.10253890850258252 0.8668856132334527 0.758781661211309
8
AR 0.2166994804098359 -0.110553662146348

  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 1 finished 0:00:19.344447


  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 2 finished 0:00:47.798588


  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 3 finished 0:03:38.709602
seq 4 finished 0:04:59.072827
seq 5 finished 0:06:05.339561
seq 6 finished 0:06:54.415857
seq 7 finished 0:08:43.094941
seq 8 finished 0:10:55.458845
seq 9 finished 0:12:19.929437
seq 10 finished 0:16:43.172789
seq 11 finished 0:19:04.646400
seq 12 finished 0:20:55.525165
seq 13 finished 0:22:45.762275
seq 14 finished 0:28:11.273408
seq 15 finished 0:30:56.621739
seq 16 finished 0:32:38.172051
seq 17 finished 0:34:09.987709
seq 18 finished 0:38:46.122121
seq 19 finished 0:55:11.246405
seq 20 finished 1:11:02.698733
seq 21 finished 2:03:49.316349
seq 22 finished 2:15:24.559751

n_seq_warmup = 1
7
AR 0.7801215701394545 0.7776165948741907 0.95043662164006 0.9535577841451767
SMap 0.7854012057278924 0.7776165948741907 0.9518522308636921 0.9535577841451767
MSLRX 0.7896535848176001 0.7776165948741907 0.9524065356801747 0.9535577841451767
MXLRXSolu3 0.7803460682573296 0.7776165948741907 0.9504877882385045 0.9535577841451767
MSLR 0.7852090149313283 0.77761659487419

  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 1 finished 0:00:17.279071


  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 2 finished 0:00:46.451869


  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 3 finished 0:03:32.706510
seq 4 finished 0:04:48.818490
seq 5 finished 0:05:51.586327
seq 6 finished 0:06:34.493959
seq 7 finished 0:08:19.138220
seq 8 finished 0:10:26.955070
seq 9 finished 0:11:50.036023
seq 10 finished 0:16:14.262800
seq 11 finished 0:18:31.485224
seq 12 finished 0:20:12.535574
seq 13 finished 0:21:54.695207
seq 14 finished 0:27:16.845797
seq 15 finished 0:29:53.757420
seq 16 finished 0:31:25.493895
seq 17 finished 0:32:42.638236
seq 18 finished 0:37:09.646294
seq 19 finished 0:52:06.910254
seq 20 finished 1:05:21.492146
seq 21 finished 1:48:40.015211
seq 22 finished 1:58:19.451640

n_seq_warmup = 1
7
AR 0.4684006345649172 0.39639830601669324 0.872982289747752 0.8779889470516352
SMap 0.505428783597704 0.39639830601669324 0.8768033997847776 0.8779889470516352
MSLRX 0.5638856144944813 0.39639830601669324 0.8954712094406041 0.8779889470516352
MXLRXSolu3 0.5460254715533712 0.39639830601669324 0.8894340379739909 0.8779889470516352
MSLR 0.5703390629991201 0.3963983060

  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 1 finished 0:00:10.039526


  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 2 finished 0:00:26.627756


  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 3 finished 0:02:59.985331


  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 4 finished 0:04:04.488048


  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 5 finished 0:04:51.808526


  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 6 finished 0:05:19.574505
seq 7 finished 0:06:41.887298
seq 8 finished 0:08:23.862652
seq 9 finished 0:09:23.611928
seq 10 finished 0:13:03.965353
seq 11 finished 0:14:49.474070
seq 12 finished 0:16:03.296176
seq 13 finished 0:17:13.425710
seq 14 finished 0:21:35.295457
seq 15 finished 0:23:30.980927
seq 16 finished 0:24:26.979166
seq 17 finished 0:25:13.581949
seq 18 finished 0:28:45.518349
seq 19 finished 0:42:13.926889
seq 20 finished 0:55:16.027275
seq 21 finished 1:39:35.047969
seq 22 finished 1:48:17.960338

n_seq_warmup = 1
7
AR 0.19403950697131833 -0.10253890850258252 0.7557281254805474 0.758781661211309
SMap 0.24084192591036624 -0.10253890850258252 0.7605720436721513 0.758781661211309
MSLRX 0.47008698630950907 -0.10253890850258252 0.8493442586937895 0.758781661211309
MXLRXSolu3 0.42656808760572995 -0.10253890850258252 0.8311768194899056 0.758781661211309
MSLR 0.4738640393079494 -0.10253890850258252 0.8520682761802245 0.758781661211309
8
AR 0.18938184721767215 -0.1105536621

In [19]:

with open(r"res/res_vimsw_pred_0205_3.csv", "w") as fw:
    
    head = ["is_log10", "is_cross", "is_phase", "horizon_forecast", "reg_method", "n_seq_warmup", "len_warmup", "r2_naive", "rocauc_naive", "r2_ar", "rocauc_ar", "r2_smap", 
            "rocauc_smap", "r2_mslrx", "rocauc_mslrx", "tmp_r2_mslrxsolu3", "tmp_rocauc_mslrxsolu3", "tmp_r2_mslr", "tmp_rocauc_mslr", ]
    fw.write(",".join(head) + "\n")

    for tmp_setting in dict_setting_results:

        is_log, is_cross, is_phase, horizon_forecast, reg_method = tmp_setting
        dict_warmup_evaluation = dict_setting_results[tmp_setting]

        for n_seq_warmup in dict_warmup_evaluation:
            
            for cc in range(len(dict_warmup_evaluation[n_seq_warmup][0])):
                
                len_warmup = cc + 7
                tmp_r2_naive, tmp_rocauc_naive, tmp_r2_ar, tmp_rocauc_ar, tmp_r2_smap, tmp_rocauc_smap, tmp_r2_mslrx, tmp_rocauc_mslrx, tmp_r2_mslrxsolu3, tmp_rocauc_mslrxsolu3, tmp_r2_mslr, tmp_rocauc_mslr = dict_warmup_evaluation[n_seq_warmup][0][cc]
            
                line = [is_log, is_cross, is_phase, horizon_forecast, reg_method, n_seq_warmup, len_warmup, tmp_r2_naive, tmp_rocauc_naive, tmp_r2_ar, tmp_rocauc_ar, tmp_r2_smap, tmp_rocauc_smap, tmp_r2_mslrx, tmp_rocauc_mslrx, tmp_r2_mslrxsolu3, tmp_rocauc_mslrxsolu3, tmp_r2_mslr, tmp_rocauc_mslr]
                fw.write(",".join([str(i) for i in line]) + "\n")

    

In [None]:

dict_setting_results = dict()
for is_log in [True]:
    for is_cross in [True, False]:
        for is_phase in [True]:
            for horizon_forecast in [1, 3, 7]:
                
                #tmp_setting = (is_log, is_cross, is_phase, horizon_forecast, "LinearRegression", )
                tmp_setting = (is_log, is_cross, is_phase, horizon_forecast, "LinearSVR", )
                
                print()
                print("================================================")
                print("setting =", tmp_setting)
                print("================================================")
                
                dict_setting_results[tmp_setting] = _main(data_W_byday, n_seq_warmup=1, 
                                                          thres_quantile = 0.9, 
                                                          reg_method = "LinearSVR", reg_method_func = LinearSVR, kargs_reg = {"random_state": 434},
                                                          #reg_method = "LinearRegression", reg_method_func = LinearRegression, kargs_reg = {},
                                                          is_log = is_log, is_cross = is_cross, is_phase = is_phase, horizon_forecast = horizon_forecast,)
                 
                


In [None]:

with open(r"res/res_vimsw_pred_0205_4.csv", "w") as fw:
    
    head = ["is_log10", "is_cross", "is_phase", "horizon_forecast", "reg_method", "n_seq_warmup", "len_warmup", "r2_naive", "rocauc_naive", "r2_ar", "rocauc_ar", "r2_smap", 
            "rocauc_smap", "r2_mslrx", "rocauc_mslrx", "tmp_r2_mslrxsolu3", "tmp_rocauc_mslrxsolu3", "tmp_r2_mslr", "tmp_rocauc_mslr", ]
    fw.write(",".join(head) + "\n")

    for tmp_setting in dict_setting_results:

        is_log, is_cross, is_phase, horizon_forecast, reg_method = tmp_setting
        dict_warmup_evaluation = dict_setting_results[tmp_setting]

        for n_seq_warmup in dict_warmup_evaluation:
            
            for cc in range(len(dict_warmup_evaluation[n_seq_warmup][0])):
                
                len_warmup = cc + 7
                tmp_r2_naive, tmp_rocauc_naive, tmp_r2_ar, tmp_rocauc_ar, tmp_r2_smap, tmp_rocauc_smap, tmp_r2_mslrx, tmp_rocauc_mslrx, tmp_r2_mslrxsolu3, tmp_rocauc_mslrxsolu3, tmp_r2_mslr, tmp_rocauc_mslr = dict_warmup_evaluation[n_seq_warmup][0][cc]
            
                line = [is_log, is_cross, is_phase, horizon_forecast, reg_method, n_seq_warmup, len_warmup, tmp_r2_naive, tmp_rocauc_naive, tmp_r2_ar, tmp_rocauc_ar, tmp_r2_smap, tmp_rocauc_smap, tmp_r2_mslrx, tmp_rocauc_mslrx, tmp_r2_mslrxsolu3, tmp_rocauc_mslrxsolu3, tmp_r2_mslr, tmp_rocauc_mslr]
                fw.write(",".join([str(i) for i in line]) + "\n")

    

## Predictions (WW) - save prediction results

In [46]:
data_WW_byday = data_WW[["date", "conductivity", "turbidity"]].resample("1D").max()
data_WW_byday["temperature"] = data_WW["temperature"].resample("1D").mean()
data_WW_byday["pH"] = data_WW["pH"].resample("1D").mean()
data_WW_byday["ODO"] = data_WW["ODO"].resample("1D").min()
data_WW_byday["salinity_max"] = data_WW["salinity"].resample("1D").max()
data_WW_byday["salinity_min"] = data_WW["salinity"].resample("1D").min()
data_WW_byday["log10_chlorophyll"] = data_WW["log10_chlorophyll"].resample("1D").max()
data_WW_byday["chlorophyll"] = data_WW["chlorophyll"].resample("1D").max()

# data_WW_byday = data_WW[["temperature", "pH"]].resample("1D").mean()
# data_WW_byday["date"] = data_WW["date"].resample("1D").max()
# data_WW_byday["conductivity"] = data_WW["conductivity"].resample("1D").apply(lambda x: x.nlargest(2).iloc[-1] if len(x) > 0 else np.nan)
# data_WW_byday["turbidity"] = data_WW["turbidity"].resample("1D").apply(lambda x: x.nlargest(2).iloc[-1] if len(x) > 0 else np.nan)
# data_WW_byday["ODO"] = data_WW["ODO"].resample("1D").apply(lambda x: x.nsmallest(2).iloc[-1] if len(x) > 0 else np.nan)
# data_WW_byday["salinity_max"] = data_WW["salinity"].resample("1D").apply(lambda x: x.nlargest(2).iloc[-1] if len(x) > 0 else np.nan)
# data_WW_byday["salinity_min"] = data_WW["salinity"].resample("1D").apply(lambda x: x.nsmallest(2).iloc[-1] if len(x) > 0 else np.nan)
# data_WW_byday["log10_chlorophyll"] = data_WW["log10_chlorophyll"].resample("1D").apply(lambda x: x.nlargest(2).iloc[-1] if len(x) > 0 else np.nan)
# data_WW_byday["chlorophyll"] = data_WW["chlorophyll"].resample("1D").apply(lambda x: x.nlargest(2).iloc[-1] if len(x) > 0 else np.nan)


In [47]:
data_W_byday = data_W[["date"]].resample("1D").max()
#data_W_byday["airpressure"] = data_W["airpressure"].resample("1D").mean()
data_W_byday["wind"] = data_W["wind"].resample("1D").mean()

data_WW_byday = data_WW_byday.merge(data_W_byday, left_index=True, right_index=True).rename(columns = {"date_x": "date"}).drop(columns = ["date_y"])


In [48]:
data_WW_byday = data_WW_byday.dropna().resample("1D").max()

In [49]:
data_WW_byday

Unnamed: 0_level_0,date,conductivity,turbidity,temperature,pH,ODO,salinity_max,salinity_min,log10_chlorophyll,chlorophyll,wind
time_min,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2018-10-12,2018-10-12,46.978,47.06,24.194146,7.902439,5.84,30.55,28.48,1.295567,19.75,2.774167
2018-10-13,2018-10-13,46.748,29.83,22.100917,7.863437,5.89,30.41,28.41,1.480438,30.23,2.256528
2018-10-14,2018-10-14,46.615,18.17,20.642937,7.871354,6.06,30.33,28.59,1.342620,22.01,1.743472
2018-10-15,2018-10-15,46.385,15.75,21.330021,7.870104,6.32,30.16,28.72,1.330008,21.38,4.257870
2018-10-16,2018-10-16,46.521,18.81,21.417677,7.843646,5.63,30.26,28.59,1.221675,16.66,2.117500
...,...,...,...,...,...,...,...,...,...,...,...
2022-12-20,2022-12-20,49.115,5.79,5.802531,7.942708,9.53,31.56,28.73,0.429752,2.69,1.510417
2022-12-21,2022-12-21,49.668,9.33,5.474958,7.948021,9.63,31.91,29.18,0.459392,2.88,2.171071
2022-12-22,2022-12-22,49.776,50.00,5.977583,7.944375,9.74,32.03,29.50,0.563481,3.66,4.545590
2022-12-23,2022-12-23,48.842,219.66,7.366091,7.911136,9.03,31.47,28.84,0.847573,7.04,5.560694


In [50]:
def _main(data_WW_byday, is_log = True, is_cross = True, is_phase = True, horizon_forecast = 1, thres_quantile = 0.95, n_seq_warmup = 6, is_mslr = True, is_smap_cv = False,
          reg_method = "LinearSVR", reg_method_func = LinearSVR, kargs_reg = {"random_state": 434},
          #reg_method = "LinearRegression", reg_method_func = LinearRegression, kargs_reg = {},
         ):
    
    # Data Preparation
    if is_log:
        array_obs = data_WW_byday["log10_chlorophyll"].to_numpy()
    else:
        array_obs = data_WW_byday["chlorophyll"].to_numpy()

    array_datetime = data_WW_byday["date"].to_numpy()
    #array_cross = data_WW_byday[["salinity_max", "salinity_min", "ODO", "turbidity", "pH", "temperature"]].to_numpy()
    array_cross = data_WW_byday[["salinity_max", "salinity_min", "ODO", "turbidity", "pH", "temperature", "wind"]].to_numpy()
    
    list_split_index = list(np.where(np.isnan(array_obs))[0])
    list_split_index.append(len(array_obs))
    list_split_index.insert(0, 0)
    list_subseq, list_subseq_cross,list_subseq_datetime = [], [], []
    for tmp_index in range(len(list_split_index) - 1):
        tmp_index_left = list_split_index[tmp_index] + 1
        tmp_index_right = list_split_index[tmp_index + 1]
        if tmp_index_right - tmp_index_left < 21:
            continue
        list_subseq.append(array_obs[tmp_index_left:tmp_index_right])
        list_subseq_datetime.append(array_datetime[tmp_index_left:tmp_index_right])
        list_subseq_cross.append(array_cross[tmp_index_left:tmp_index_right])
    
    list_p_AR = [2, 1, ]
    list_p_AR_cross = [1, ]
    list_p_AR.sort(reverse = True)
    max_p = max(max(list_p_AR), max(list_p_AR_cross) if len(list_p_AR_cross) != 0 else 0)

    list_X, list_Y, list_exog = [], [], []
    list_date = []
    for tmp_subseq, tmp_subseq_datetime, tmp_subseq_cross in zip(list_subseq, list_subseq_datetime, list_subseq_cross):
        X, Y, exog = [], [], []
        tmp_list_date = []
        for tmp_index_right in range(max_p, len(tmp_subseq) - horizon_forecast + 1):

            tmp_index_test = tmp_index_right - 1 + horizon_forecast # tmp_index_right == tmp_index_test if horizon_forecast = 1

            tmp_X = []
            if is_cross:
                for tmp_p in list_p_AR_cross:
                    tmp_X.extend(tmp_subseq_cross[tmp_index_right - tmp_p])
            for tmp_p in list_p_AR:
                tmp_X.append(tmp_subseq[tmp_index_right - tmp_p])
            X.append(tmp_X)
            Y.append([tmp_subseq[tmp_index_test]])

            tmp_date = tmp_subseq_datetime[tmp_index_test]
            tmp_list_date.append(tmp_date)
            
            tmp_t = pd.Timestamp(tmp_subseq_datetime[tmp_index_test]).toordinal() - pd.Timestamp(tmp_subseq_datetime[tmp_index_test]).replace(month = 1, day = 1).toordinal()
            if is_phase:
                tmp_exog = [np.cos(tmp_t / 365 * 2 * np.pi), np.sin(tmp_t / 365 * 2 * np.pi)]
            else:
                tmp_exog = [np.cos(tmp_t / 365 * 2 * np.pi)]
            exog.append(tmp_exog)

        list_X.append(X)
        list_Y.append(Y)
        list_exog.append(exog)
        list_date.append(tmp_list_date)
    
    print("Subseqs:")
    for tmp_index in range(len(list_subseq)):
        print("\t", tmp_index, pd.Timestamp(list_subseq_datetime[tmp_index][1]).date(), pd.Timestamp(list_subseq_datetime[tmp_index][-1]).date(), len(list_subseq[tmp_index]))
    print()
    
    # Prediction
    
    start_time = datetime.datetime.now()

    list_y_true, list_y_pred_naive = [], []
    list_y_pred_mslrx, list_y_pred_mslr = [], []
    list_y_pred_mslrxsolu3 = []
    list_y_pred_smap, list_y_pred_ar = [], []
    list_y_date = []
    list_y_seq, list_y_seq_index = [], []
    for cc in range(n_seq_warmup, len(list_X)):
        
        tmp_list_y_true, tmp_list_y_pred_naive = [], []
        tmp_list_y_pred_mslrx, tmp_list_y_pred_mslr = [], []
        tmp_list_y_pred_mslrxsolu3 = []
        tmp_list_y_pred_smap, tmp_list_y_pred_ar = [], []
        tmp_list_y_date = []
        tmp_list_y_seq, tmp_list_y_seq_index = [], []
        for tmp_index_test in range(5, len(list_X[cc])):
            tmp_list_X, tmp_list_Y, tmp_list_exog = [], [], []
            tmp_list_X_test = []
            for ii in range(cc):
                tmp_list_X.append(copy.deepcopy(list_X[ii]))
                tmp_list_Y.append(copy.deepcopy(list_Y[ii]))
                tmp_list_exog.append(copy.deepcopy(list_exog[ii]))
                tmp_list_X_test.append(copy.deepcopy(list_X[ii][-1]))
            tmp_list_X.append(copy.deepcopy(list_X[cc][:tmp_index_test]))
            tmp_list_Y.append(copy.deepcopy(list_Y[cc][:tmp_index_test]))
            tmp_list_exog.append(copy.deepcopy(list_exog[cc][:tmp_index_test]))
            tmp_list_X_test.append(copy.deepcopy(list_X[cc][tmp_index_test]))
            
            tmp_y_true = list_Y[cc][tmp_index_test]
            tmp_y_pred_naive = list_X[cc][tmp_index_test][-1]
            
            if is_mslr:

                mslrx = MSLRX(n_components = 2, covariance_type="diag", n_iter = 10, is_logistic_regression_CV = False,
                              logistic_regression_C = 1e10, is_logistic_regression_standardized = False, reg_method = reg_method, kargs_reg = kargs_reg)
                tmp_pred = mslrx.fit_predict(tmp_list_X, tmp_list_Y, tmp_list_exog, tmp_list_X_test, is_multiple_sequence = True)
                tmp_y_pred = tmp_pred[-1]
                tmp_list_y_pred_mslrx.append(tmp_y_pred)

                mslrx = MSLRXSoluIII(n_components = 2, covariance_type="diag", n_iter = 10, reg_method = reg_method, kargs_reg = kargs_reg)
                tmp_pred = mslrx.fit_predict(tmp_list_X, tmp_list_Y, tmp_list_exog, tmp_list_X_test, is_multiple_sequence = True)
                tmp_y_pred = tmp_pred[-1]
                tmp_list_y_pred_mslrxsolu3.append(tmp_y_pred)

                mslr = MSLR(n_components = 2, covariance_type="diag", n_iter = 10, reg_method = reg_method, kargs_reg = kargs_reg)
                tmp_pred = mslr.fit_predict(tmp_list_X, tmp_list_Y, tmp_list_X_test, is_multiple_sequence = True)
                tmp_y_pred = tmp_pred[-1]
                tmp_list_y_pred_mslr.append(tmp_y_pred)
                
            else:
                tmp_list_y_pred_mslrx.append(0.)
                tmp_list_y_pred_mslrxsolu3.append(0.)
                tmp_list_y_pred_mslr.append(0.)
            
            lr = reg_method_func(**kargs_reg)
            lr.fit(np.concatenate(tmp_list_X), np.concatenate(tmp_list_Y).flatten())
            tmp_pred = lr.predict(tmp_list_X_test)
            tmp_y_pred = tmp_pred[-1]
            tmp_list_y_pred_ar.append(tmp_y_pred)
            
            if is_smap_cv:
                lr = SMapCV(thetas = (0.5, 1.0, 1.5, ), reg_method = reg_method, kargs_reg = kargs_reg)
            else:
                lr = SMap(theta = 0.5, reg_method = reg_method, kargs_reg = kargs_reg)
            lr.fit(np.concatenate(tmp_list_X), np.concatenate(tmp_list_Y).flatten())
            tmp_pred = lr.predict(tmp_list_X_test)
            tmp_y_pred = tmp_pred[-1]
            tmp_list_y_pred_smap.append(tmp_y_pred)

            tmp_list_y_true.append(tmp_y_true)
            tmp_list_y_pred_naive.append(tmp_y_pred_naive)
            tmp_list_y_date.append(list_date[cc][tmp_index_test])
            tmp_list_y_seq.append(cc)
            tmp_list_y_seq_index.append(tmp_index_test)
        
        list_y_date.append(tmp_list_y_date)
        list_y_seq.append(tmp_list_y_seq)
        list_y_seq_index.append(tmp_list_y_seq_index)
        if is_log:
            list_y_true.append(np.power(10, tmp_list_y_true))
            list_y_pred_naive.append(np.power(10, tmp_list_y_pred_naive))
            list_y_pred_mslrx.append(np.power(10, tmp_list_y_pred_mslrx))
            list_y_pred_mslr.append(np.power(10, tmp_list_y_pred_mslr))
            list_y_pred_mslrxsolu3.append(np.power(10, tmp_list_y_pred_mslrxsolu3))
            list_y_pred_ar.append(np.power(10, tmp_list_y_pred_ar))
            list_y_pred_smap.append(np.power(10, tmp_list_y_pred_smap))
        else:
            list_y_true.append(tmp_list_y_true)
            list_y_pred_naive.append(tmp_list_y_pred_naive)
            list_y_pred_mslrx.append(tmp_list_y_pred_mslrx)
            list_y_pred_mslr.append(tmp_list_y_pred_mslr)
            list_y_pred_mslrxsolu3.append(tmp_list_y_pred_mslrxsolu3)
            list_y_pred_ar.append(tmp_list_y_pred_ar)
            list_y_pred_smap.append(tmp_list_y_pred_smap)
        
        print("seq % s finished" % cc, str(datetime.datetime.now() - start_time))
        
    list_y_date, list_y_true, list_y_pred_naive = np.concatenate(list_y_date).flatten(), np.concatenate(list_y_true).flatten(), np.concatenate(list_y_pred_naive).flatten()
    list_y_pred_mslrx, list_y_pred_mslr, list_y_pred_mslrxsolu3, list_y_pred_ar, list_y_pred_smap = np.concatenate(list_y_pred_mslrx).flatten(), np.concatenate(list_y_pred_mslr).flatten(), np.concatenate(list_y_pred_mslrxsolu3).flatten(), np.concatenate(list_y_pred_ar).flatten(), np.concatenate(list_y_pred_smap).flatten() 
    list_y_seq, list_y_seq_index = np.concatenate(list_y_seq).flatten(), np.concatenate(list_y_seq_index).flatten()
    
    dict_method_pred = {"date": list_y_date, 
                        "true": list_y_true, 
                        "pred_naive": list_y_pred_naive, 
                        "pred_mslrx": list_y_pred_mslrx,
                        "pred_mslr": list_y_pred_mslr,
                        "pred_mslrxsolu3": list_y_pred_mslrxsolu3, 
                        "pred_ar": list_y_pred_ar,
                        "pred_smap": list_y_pred_smap,
                        "seq": list_y_seq,
                        "seq_index": list_y_seq_index,
                       }
                
    return pd.DataFrame(dict_method_pred)


In [51]:

dict_setting_results = dict()
for is_log in [True, ]:
    for is_cross in [True, False]:
        for is_phase in [True]:
            for horizon_forecast in [1, 3, 7]:
                
                tmp_setting = (is_log, is_cross, is_phase, horizon_forecast, "LinearRegression", )
                
                print()
                print("================================================")
                print("setting =", tmp_setting)
                print("================================================")
                
                tmp_res_df = _main(data_WW_byday, n_seq_warmup=1, 
                                  #reg_method = "LinearSVR", reg_method_func = LinearSVR, kargs_reg = {"random_state": 434},
                                  reg_method = "LinearRegression", reg_method_func = LinearRegression, kargs_reg = {},
                                  is_log = is_log, is_cross = is_cross, is_phase = is_phase, horizon_forecast = horizon_forecast,
                                  is_mslr = False)
                
                dict_setting_results[tmp_setting] = tmp_res_df
                
                tail_log = "_log" if is_log else ""
                tail_cross = "_X" if is_cross else ""
                tail_horizon = "% sD" % horizon_forecast
                tail_method = "LR"
                #tail_max = "_2ndmax"
                tail_max = ""
                
                #fname_write = "vimsWW_wind% s_pred_% s_% s% s% s.csv" % (tail_max, tail_method, tail_horizon, tail_cross, tail_log, )
                fname_write = "vimsWW_wind% s_pred_nomslr_% s_% s% s% s.csv" % (tail_max, tail_method, tail_horizon, tail_cross, tail_log, )
                tmp_res_df.to_csv(os.path.join("res", "vimsWW_pred", fname_write), index = False)



setting = (True, True, True, 1, 'LinearRegression')
Subseqs:
	 0 2018-11-01 2019-01-22 84
	 1 2019-02-02 2019-03-24 52
	 2 2019-03-27 2019-06-24 91
	 3 2019-06-27 2019-07-25 30
	 4 2019-08-02 2019-11-16 108
	 5 2019-11-20 2020-02-12 86
	 6 2020-02-15 2020-03-27 43
	 7 2020-03-31 2020-07-16 109
	 8 2020-09-04 2020-09-25 23
	 9 2021-02-09 2021-03-07 28
	 10 2021-04-01 2021-06-28 90
	 11 2021-07-01 2021-07-21 22
	 12 2021-07-29 2021-08-31 35
	 13 2021-09-03 2021-10-27 56
	 14 2021-10-30 2022-01-01 65
	 15 2022-01-15 2022-03-26 72
	 16 2022-03-29 2022-07-23 118
	 17 2022-07-28 2022-12-09 136

seq 1 finished 0:00:00.371526
seq 2 finished 0:00:01.459102
seq 3 finished 0:00:01.840743
seq 4 finished 0:00:03.997401
seq 5 finished 0:00:06.071699
seq 6 finished 0:00:07.215299
seq 7 finished 0:00:10.801829
seq 8 finished 0:00:11.466419
seq 9 finished 0:00:12.324745
seq 10 finished 0:00:16.084025
seq 11 finished 0:00:16.828694
seq 12 finished 0:00:18.386043
seq 13 finished 0:00:21.172474
seq 14 fi

## Predictions (W) - save prediction results

In [13]:

data_W_byday = data_W[["temperature", "wind"]].resample("1D").mean()
data_W_byday["date"] = data_W["date"].resample("1D").max()
data_W_byday["ODO"] = data_W["ODO"].resample("1D").min()
data_W_byday["salinity_max"] = data_W["salinity"].resample("1D").max()
data_W_byday["salinity_min"] = data_W["salinity"].resample("1D").min()
data_W_byday["log10_chlorophyll"] = data_W["log10_chlorophyll"].resample("1D").max()
data_W_byday["chlorophyll"] = data_W["chlorophyll"].resample("1D").max()

# data_W_byday = data_W[["temperature", "wind"]].resample("1D").mean()
# data_W_byday["date"] = data_W["date"].resample("1D").max()
# data_W_byday["ODO"] = data_W["ODO"].resample("1D").apply(lambda x: x.nsmallest(2).iloc[-1] if len(x) > 0 else np.nan)
# data_W_byday["salinity_max"] = data_W["salinity"].resample("1D").apply(lambda x: x.nlargest(2).iloc[-1] if len(x) > 0 else np.nan)
# data_W_byday["salinity_min"] = data_W["salinity"].resample("1D").apply(lambda x: x.nsmallest(2).iloc[-1] if len(x) > 0 else np.nan)
# data_W_byday["log10_chlorophyll"] = data_W["log10_chlorophyll"].resample("1D").apply(lambda x: x.nlargest(2).iloc[-1] if len(x) > 0 else np.nan)
# data_W_byday["chlorophyll"] = data_W["chlorophyll"].resample("1D").apply(lambda x: x.nlargest(2).iloc[-1] if len(x) > 0 else np.nan)


In [14]:
data_W_byday = data_W_byday.dropna().resample("1D").max()

In [15]:
data_W_byday

Unnamed: 0_level_0,temperature,wind,date,ODO,salinity_max,salinity_min,log10_chlorophyll,chlorophyll
time_min,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2016-03-25,16.105438,3.060000,2016-03-25,7.820000,31.4775,30.9550,0.360495,2.293480
2016-03-26,15.533741,4.754167,2016-03-26,7.125000,31.4075,27.5425,0.576452,3.770964
2016-03-27,13.722917,5.606726,2016-03-27,7.417500,31.3725,28.8325,0.518692,3.301350
2016-03-28,,,,,,,,
2016-03-29,14.414156,2.348611,2016-03-29,7.786667,30.8775,26.2625,0.674259,4.723450
...,...,...,...,...,...,...,...,...
2022-12-27,1.523598,0.682335,2022-12-27,10.987500,31.8500,30.0550,0.437470,2.738228
2022-12-28,2.451531,1.003393,2022-12-28,11.030000,31.8800,30.1025,0.505406,3.201885
2022-12-29,3.539875,1.933671,2022-12-29,10.935000,31.8700,30.1475,0.487113,3.069820
2022-12-30,4.775479,1.388444,2022-12-30,10.742500,31.8975,30.1700,0.480288,3.021955


In [16]:
def _main(data_WW_byday, is_log = True, is_cross = True, is_phase = True, horizon_forecast = 1, thres_quantile = 0.95, n_seq_warmup = 6, is_mslr = True, is_smap_cv = False,
          reg_method = "LinearSVR", reg_method_func = LinearSVR, kargs_reg = {"random_state": 434},
          #reg_method = "LinearRegression", reg_method_func = LinearRegression, kargs_reg = {},
         ):
    
    # Data Preparation
    if is_log:
        array_obs = data_WW_byday["log10_chlorophyll"].to_numpy()
    else:
        array_obs = data_WW_byday["chlorophyll"].to_numpy()

    array_datetime = data_WW_byday["date"].to_numpy()
    #array_cross = data_WW_byday[["salinity_max", "salinity_min", "ODO", "temperature"]].to_numpy()
    array_cross = data_WW_byday[["salinity_max", "salinity_min", "ODO", "temperature", "wind"]].to_numpy()
    
    list_split_index = list(np.where(np.isnan(array_obs))[0])
    list_split_index.append(len(array_obs))
    list_split_index.insert(0, 0)
    list_subseq, list_subseq_cross,list_subseq_datetime = [], [], []
    for tmp_index in range(len(list_split_index) - 1):
        tmp_index_left = list_split_index[tmp_index] + 1
        tmp_index_right = list_split_index[tmp_index + 1]
        if tmp_index_right - tmp_index_left < 21:
            continue
        list_subseq.append(array_obs[tmp_index_left:tmp_index_right])
        list_subseq_datetime.append(array_datetime[tmp_index_left:tmp_index_right])
        list_subseq_cross.append(array_cross[tmp_index_left:tmp_index_right])
    
    list_p_AR = [2, 1, ]
    list_p_AR_cross = [1, ]
    list_p_AR.sort(reverse = True)
    max_p = max(max(list_p_AR), max(list_p_AR_cross) if len(list_p_AR_cross) != 0 else 0)

    list_X, list_Y, list_exog = [], [], []
    list_date = []
    for tmp_subseq, tmp_subseq_datetime, tmp_subseq_cross in zip(list_subseq, list_subseq_datetime, list_subseq_cross):
        X, Y, exog = [], [], []
        tmp_list_date = []
        for tmp_index_right in range(max_p, len(tmp_subseq) - horizon_forecast + 1):

            tmp_index_test = tmp_index_right - 1 + horizon_forecast # tmp_index_right == tmp_index_test if horizon_forecast = 1

            tmp_X = []
            if is_cross:
                for tmp_p in list_p_AR_cross:
                    tmp_X.extend(tmp_subseq_cross[tmp_index_right - tmp_p])
            for tmp_p in list_p_AR:
                tmp_X.append(tmp_subseq[tmp_index_right - tmp_p])
            X.append(tmp_X)
            Y.append([tmp_subseq[tmp_index_test]])

            tmp_date = tmp_subseq_datetime[tmp_index_test]
            tmp_list_date.append(tmp_date)
            
            tmp_t = pd.Timestamp(tmp_subseq_datetime[tmp_index_test]).toordinal() - pd.Timestamp(tmp_subseq_datetime[tmp_index_test]).replace(month = 1, day = 1).toordinal()
            if is_phase:
                tmp_exog = [np.cos(tmp_t / 365 * 2 * np.pi), np.sin(tmp_t / 365 * 2 * np.pi)]
            else:
                tmp_exog = [np.cos(tmp_t / 365 * 2 * np.pi)]
            exog.append(tmp_exog)

        list_X.append(X)
        list_Y.append(Y)
        list_exog.append(exog)
        list_date.append(tmp_list_date)
    
    print("Subseqs:")
    for tmp_index in range(len(list_subseq)):
        print("\t", tmp_index, pd.Timestamp(list_subseq_datetime[tmp_index][1]).date(), pd.Timestamp(list_subseq_datetime[tmp_index][-1]).date(), len(list_subseq[tmp_index]))
    print()
    
    # Prediction
    
    start_time = datetime.datetime.now()

    list_y_true, list_y_pred_naive = [], []
    list_y_pred_mslrx, list_y_pred_mslr = [], []
    list_y_pred_mslrxsolu3 = []
    list_y_pred_smap, list_y_pred_ar = [], []
    list_y_date = []
    list_y_seq, list_y_seq_index = [], []
    for cc in range(n_seq_warmup, len(list_X)):
        
        tmp_list_y_true, tmp_list_y_pred_naive = [], []
        tmp_list_y_pred_mslrx, tmp_list_y_pred_mslr = [], []
        tmp_list_y_pred_mslrxsolu3 = []
        tmp_list_y_pred_smap, tmp_list_y_pred_ar = [], []
        tmp_list_y_date = []
        tmp_list_y_seq, tmp_list_y_seq_index = [], []
        for tmp_index_test in range(5, len(list_X[cc])):
            tmp_list_X, tmp_list_Y, tmp_list_exog = [], [], []
            tmp_list_X_test = []
            for ii in range(cc):
                tmp_list_X.append(copy.deepcopy(list_X[ii]))
                tmp_list_Y.append(copy.deepcopy(list_Y[ii]))
                tmp_list_exog.append(copy.deepcopy(list_exog[ii]))
                tmp_list_X_test.append(copy.deepcopy(list_X[ii][-1]))
            tmp_list_X.append(copy.deepcopy(list_X[cc][:tmp_index_test]))
            tmp_list_Y.append(copy.deepcopy(list_Y[cc][:tmp_index_test]))
            tmp_list_exog.append(copy.deepcopy(list_exog[cc][:tmp_index_test]))
            tmp_list_X_test.append(copy.deepcopy(list_X[cc][tmp_index_test]))
            
            tmp_y_true = list_Y[cc][tmp_index_test]
            tmp_y_pred_naive = list_X[cc][tmp_index_test][-1]
            
            if is_mslr:

                mslrx = MSLRX(n_components = 2, covariance_type="diag", n_iter = 10, is_logistic_regression_CV = False,
                              logistic_regression_C = 1e10, is_logistic_regression_standardized = False, reg_method = reg_method, kargs_reg = kargs_reg)
                tmp_pred = mslrx.fit_predict(tmp_list_X, tmp_list_Y, tmp_list_exog, tmp_list_X_test, is_multiple_sequence = True)
                tmp_y_pred = tmp_pred[-1]
                tmp_list_y_pred_mslrx.append(tmp_y_pred)

                mslrx = MSLRXSoluIII(n_components = 2, covariance_type="diag", n_iter = 10, reg_method = reg_method, kargs_reg = kargs_reg)
                tmp_pred = mslrx.fit_predict(tmp_list_X, tmp_list_Y, tmp_list_exog, tmp_list_X_test, is_multiple_sequence = True)
                tmp_y_pred = tmp_pred[-1]
                tmp_list_y_pred_mslrxsolu3.append(tmp_y_pred)

                mslr = MSLR(n_components = 2, covariance_type="diag", n_iter = 10, reg_method = reg_method, kargs_reg = kargs_reg)
                tmp_pred = mslr.fit_predict(tmp_list_X, tmp_list_Y, tmp_list_X_test, is_multiple_sequence = True)
                tmp_y_pred = tmp_pred[-1]
                tmp_list_y_pred_mslr.append(tmp_y_pred)
                
            else:
                tmp_list_y_pred_mslrx.append(0.)
                tmp_list_y_pred_mslrxsolu3.append(0.)
                tmp_list_y_pred_mslr.append(0.)
            
            lr = reg_method_func(**kargs_reg)
            lr.fit(np.concatenate(tmp_list_X), np.concatenate(tmp_list_Y).flatten())
            tmp_pred = lr.predict(tmp_list_X_test)
            tmp_y_pred = tmp_pred[-1]
            tmp_list_y_pred_ar.append(tmp_y_pred)
            
            if is_smap_cv:
                lr = SMapCV(thetas = (0.5, 1.0, 1.5, ), reg_method = reg_method, kargs_reg = kargs_reg)
            else:
                lr = SMap(theta = 0.5, reg_method = reg_method, kargs_reg = kargs_reg)
            lr.fit(np.concatenate(tmp_list_X), np.concatenate(tmp_list_Y).flatten())
            tmp_pred = lr.predict(tmp_list_X_test)
            tmp_y_pred = tmp_pred[-1]
            tmp_list_y_pred_smap.append(tmp_y_pred)

            tmp_list_y_true.append(tmp_y_true)
            tmp_list_y_pred_naive.append(tmp_y_pred_naive)
            tmp_list_y_date.append(list_date[cc][tmp_index_test])
            tmp_list_y_seq.append(cc)
            tmp_list_y_seq_index.append(tmp_index_test)
        
        list_y_date.append(tmp_list_y_date)
        list_y_seq.append(tmp_list_y_seq)
        list_y_seq_index.append(tmp_list_y_seq_index)
        if is_log:
            list_y_true.append(np.power(10, tmp_list_y_true))
            list_y_pred_naive.append(np.power(10, tmp_list_y_pred_naive))
            list_y_pred_mslrx.append(np.power(10, tmp_list_y_pred_mslrx))
            list_y_pred_mslr.append(np.power(10, tmp_list_y_pred_mslr))
            list_y_pred_mslrxsolu3.append(np.power(10, tmp_list_y_pred_mslrxsolu3))
            list_y_pred_ar.append(np.power(10, tmp_list_y_pred_ar))
            list_y_pred_smap.append(np.power(10, tmp_list_y_pred_smap))
        else:
            list_y_true.append(tmp_list_y_true)
            list_y_pred_naive.append(tmp_list_y_pred_naive)
            list_y_pred_mslrx.append(tmp_list_y_pred_mslrx)
            list_y_pred_mslr.append(tmp_list_y_pred_mslr)
            list_y_pred_mslrxsolu3.append(tmp_list_y_pred_mslrxsolu3)
            list_y_pred_ar.append(tmp_list_y_pred_ar)
            list_y_pred_smap.append(tmp_list_y_pred_smap)
        
        print("seq % s finished" % cc, str(datetime.datetime.now() - start_time))
        
    list_y_date, list_y_true, list_y_pred_naive = np.concatenate(list_y_date).flatten(), np.concatenate(list_y_true).flatten(), np.concatenate(list_y_pred_naive).flatten()
    list_y_pred_mslrx, list_y_pred_mslr, list_y_pred_mslrxsolu3, list_y_pred_ar, list_y_pred_smap = np.concatenate(list_y_pred_mslrx).flatten(), np.concatenate(list_y_pred_mslr).flatten(), np.concatenate(list_y_pred_mslrxsolu3).flatten(), np.concatenate(list_y_pred_ar).flatten(), np.concatenate(list_y_pred_smap).flatten() 
    list_y_seq, list_y_seq_index = np.concatenate(list_y_seq).flatten(), np.concatenate(list_y_seq_index).flatten()
    
    dict_method_pred = {"date": list_y_date, 
                        "true": list_y_true, 
                        "pred_naive": list_y_pred_naive, 
                        "pred_mslrx": list_y_pred_mslrx,
                        "pred_mslr": list_y_pred_mslr,
                        "pred_mslrxsolu3": list_y_pred_mslrxsolu3, 
                        "pred_ar": list_y_pred_ar,
                        "pred_smap": list_y_pred_smap,
                        "seq": list_y_seq,
                        "seq_index": list_y_seq_index,
                       }
                
    return pd.DataFrame(dict_method_pred)


In [17]:

dict_setting_results = dict()
for is_log in [True, ]:
    for is_cross in [True, False]:
        for is_phase in [True]:
            for horizon_forecast in [1, 3, 7]:
                
                tmp_setting = (is_log, is_cross, is_phase, horizon_forecast, "LinearRegression", )
                
                print()
                print("================================================")
                print("setting =", tmp_setting)
                print("================================================")
                
                tmp_res_df = _main(data_W_byday, n_seq_warmup=1, 
                                                          #reg_method = "LinearSVR", reg_method_func = LinearSVR, kargs_reg = {"random_state": 434},
                                                          reg_method = "LinearRegression", reg_method_func = LinearRegression, kargs_reg = {},
                                                          is_log = is_log, is_cross = is_cross, is_phase = is_phase, horizon_forecast = horizon_forecast,)
                
                dict_setting_results[tmp_setting] = tmp_res_df
                
                tail_log = "_log" if is_log else ""
                tail_cross = "_X" if is_cross else ""
                tail_horizon = "% sD" % horizon_forecast
                tail_method = "LR"
                #tail_max = "_2ndmax"
                tail_max = ""
                
                fname_write = "vimsW_wind% s_pred_% s_% s% s% s.csv" % (tail_max, tail_method, tail_horizon, tail_cross, tail_log, )
                tmp_res_df.to_csv(os.path.join("res", "vimsW_pred", fname_write), index = False)



setting = (True, True, True, 1, 'LinearRegression')
Subseqs:
	 0 2016-04-19 2016-05-10 23
	 1 2016-06-07 2016-06-30 25
	 2 2016-07-19 2016-08-13 27
	 3 2016-08-31 2016-11-23 86
	 4 2017-04-06 2017-05-11 37
	 5 2017-06-16 2017-07-13 29
	 6 2017-07-19 2017-08-08 22
	 7 2017-09-13 2017-10-19 38
	 8 2017-11-01 2017-12-10 41
	 9 2018-01-26 2018-02-22 29
	 10 2018-04-10 2018-06-15 68
	 11 2019-02-05 2019-03-12 37
	 12 2019-03-21 2019-04-17 29
	 13 2019-04-25 2019-05-21 28
	 14 2019-05-24 2019-07-25 64
	 15 2019-08-01 2019-09-02 34
	 16 2019-09-06 2019-09-27 23
	 17 2020-04-15 2020-05-04 21
	 18 2020-12-12 2021-01-27 48
	 19 2021-03-26 2021-08-05 134
	 20 2021-08-13 2021-12-01 112
	 21 2021-12-16 2022-10-23 313
	 22 2022-10-26 2022-12-31 68



  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 1 finished 0:00:21.832589


  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 2 finished 0:00:53.607901
seq 3 finished 0:04:05.108478
seq 4 finished 0:05:32.787702
seq 5 finished 0:06:48.688351
seq 6 finished 0:07:45.986181
seq 7 finished 0:09:47.912769
seq 8 finished 0:12:15.934658
seq 9 finished 0:14:01.909063
seq 10 finished 0:19:23.499871
seq 11 finished 0:22:10.455717
seq 12 finished 0:24:16.532247
seq 13 finished 0:26:26.616835
seq 14 finished 0:32:45.061417
seq 15 finished 0:36:04.141441
seq 16 finished 0:37:53.914666
seq 17 finished 0:39:42.722319
seq 18 finished 0:45:04.766149
seq 19 finished 1:03:10.064465
seq 20 finished 1:20:57.547167
seq 21 finished 2:14:44.068325
seq 22 finished 2:25:40.147326

setting = (True, True, True, 3, 'LinearRegression')
Subseqs:
	 0 2016-04-19 2016-05-10 23
	 1 2016-06-07 2016-06-30 25
	 2 2016-07-19 2016-08-13 27
	 3 2016-08-31 2016-11-23 86
	 4 2017-04-06 2017-05-11 37
	 5 2017-06-16 2017-07-13 29
	 6 2017-07-19 2017-08-08 22
	 7 2017-09-13 2017-10-19 38
	 8 2017-11-01 2017-12-10 41
	 9 2018-01-26 2018-02-22 29
	 10 

  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 1 finished 0:00:13.730000


  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 2 finished 0:00:37.566025


  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 3 finished 0:03:19.944000
seq 4 finished 0:04:31.271000
seq 5 finished 0:05:25.100000
seq 6 finished 0:06:01.961000
seq 7 finished 0:07:33.034000
seq 8 finished 0:09:32.505001
seq 9 finished 0:10:52.814001
seq 10 finished 0:15:07.627000
seq 11 finished 0:17:15.056000
seq 12 finished 0:18:48.187002
seq 13 finished 0:20:22.365000
seq 14 finished 0:25:25.934000
seq 15 finished 0:27:55.249001
seq 16 finished 0:29:30.355809
seq 17 finished 0:31:04.028038
seq 18 finished 0:36:26.666095
seq 19 finished 0:55:19.410424
seq 20 finished 1:12:31.737877
seq 21 finished 2:06:20.997555
seq 22 finished 2:18:33.958955

setting = (True, True, True, 7, 'LinearRegression')
Subseqs:
	 0 2016-04-19 2016-05-10 23
	 1 2016-06-07 2016-06-30 25
	 2 2016-07-19 2016-08-13 27
	 3 2016-08-31 2016-11-23 86
	 4 2017-04-06 2017-05-11 37
	 5 2017-06-16 2017-07-13 29
	 6 2017-07-19 2017-08-08 22
	 7 2017-09-13 2017-10-19 38
	 8 2017-11-01 2017-12-10 41
	 9 2018-01-26 2018-02-22 29
	 10 2018-04-10 2018-06-15 68
	 11 

  tmp_log_prob.append(logsumexp([np.log(self.transmat_[ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(self.transmat_[ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(self.transmat_[ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(self.transmat_[ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(self.transmat_[ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(self.transmat_[ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(self.transmat_[ii, j]) + list_cur_ma

seq 1 finished 0:00:10.834312
seq 2 finished 0:00:28.181547


  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 3 finished 0:03:06.567053
seq 4 finished 0:04:13.988919
seq 5 finished 0:05:04.711313
seq 6 finished 0:05:31.830895
seq 7 finished 0:06:57.361018
seq 8 finished 0:08:46.339071
seq 9 finished 0:09:53.856026
seq 10 finished 0:14:04.534899
seq 11 finished 0:15:51.513825
seq 12 finished 0:17:09.748199
seq 13 finished 0:18:23.726632
seq 14 finished 0:23:04.320920
seq 15 finished 0:25:14.897258
seq 16 finished 0:26:17.027490
seq 17 finished 0:27:07.404286
seq 18 finished 0:31:01.656633
seq 19 finished 0:47:41.565772
seq 20 finished 1:02:51.923726
seq 21 finished 1:56:01.079027
seq 22 finished 2:06:39.424316

setting = (True, False, True, 1, 'LinearRegression')
Subseqs:
	 0 2016-04-19 2016-05-10 23
	 1 2016-06-07 2016-06-30 25
	 2 2016-07-19 2016-08-13 27
	 3 2016-08-31 2016-11-23 86
	 4 2017-04-06 2017-05-11 37
	 5 2017-06-16 2017-07-13 29
	 6 2017-07-19 2017-08-08 22
	 7 2017-09-13 2017-10-19 38
	 8 2017-11-01 2017-12-10 41
	 9 2018-01-26 2018-02-22 29
	 10 2018-04-10 2018-06-15 68
	 11

  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 1 finished 0:00:21.165066


  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 2 finished 0:00:52.929432
seq 3 finished 0:04:01.742813
seq 4 finished 0:05:29.924815
seq 5 finished 0:06:40.116345
seq 6 finished 0:07:31.761855
seq 7 finished 0:09:31.235842
seq 8 finished 0:11:53.872610
seq 9 finished 0:13:28.268921
seq 10 finished 0:18:00.990845
seq 11 finished 0:20:31.652035
seq 12 finished 0:22:22.100166
seq 13 finished 0:24:13.099405
seq 14 finished 0:29:50.488235
seq 15 finished 0:32:51.995558
seq 16 finished 0:34:54.201448
seq 17 finished 0:36:41.238811
seq 18 finished 0:42:03.174933
seq 19 finished 0:59:49.516337
seq 20 finished 1:16:28.187588
seq 21 finished 2:08:39.599757
seq 22 finished 2:19:39.157759

setting = (True, False, True, 3, 'LinearRegression')
Subseqs:
	 0 2016-04-19 2016-05-10 23
	 1 2016-06-07 2016-06-30 25
	 2 2016-07-19 2016-08-13 27
	 3 2016-08-31 2016-11-23 86
	 4 2017-04-06 2017-05-11 37
	 5 2017-06-16 2017-07-13 29
	 6 2017-07-19 2017-08-08 22
	 7 2017-09-13 2017-10-19 38
	 8 2017-11-01 2017-12-10 41
	 9 2018-01-26 2018-02-22 29
	 10

  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 1 finished 0:00:14.936999


  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 2 finished 0:00:40.342000


  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 3 finished 0:03:25.104000
seq 4 finished 0:04:42.165000
seq 5 finished 0:05:36.775002
seq 6 finished 0:06:15.892000
seq 7 finished 0:07:51.700000
seq 8 finished 0:09:52.169000
seq 9 finished 0:11:09.950000
seq 10 finished 0:15:24.377001
seq 11 finished 0:17:27.526000
seq 12 finished 0:19:00.804001
seq 13 finished 0:20:34.114000
seq 14 finished 0:25:32.380000
seq 15 finished 0:28:03.372001
seq 16 finished 0:29:49.319000
seq 17 finished 0:31:22.059000
seq 18 finished 0:36:24.245001
seq 19 finished 0:54:09.470002
seq 20 finished 1:09:50.525019
seq 21 finished 2:01:35.015082
seq 22 finished 2:12:21.437081

setting = (True, False, True, 7, 'LinearRegression')
Subseqs:
	 0 2016-04-19 2016-05-10 23
	 1 2016-06-07 2016-06-30 25
	 2 2016-07-19 2016-08-13 27
	 3 2016-08-31 2016-11-23 86
	 4 2017-04-06 2017-05-11 37
	 5 2017-06-16 2017-07-13 29
	 6 2017-07-19 2017-08-08 22
	 7 2017-09-13 2017-10-19 38
	 8 2017-11-01 2017-12-10 41
	 9 2018-01-26 2018-02-22 29
	 10 2018-04-10 2018-06-15 68
	 11

  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 1 finished 0:00:10.568000
seq 2 finished 0:00:26.901002


  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 3 finished 0:02:55.586000
seq 4 finished 0:03:56.653001
seq 5 finished 0:04:41.868000
seq 6 finished 0:05:09.203999
seq 7 finished 0:06:30.855000
seq 8 finished 0:08:07.600002
seq 9 finished 0:09:07.439001
seq 10 finished 0:12:44.651001
seq 11 finished 0:14:21.089002
seq 12 finished 0:15:29.710000
seq 13 finished 0:16:37.590002
seq 14 finished 0:20:54.237000
seq 15 finished 0:22:47.321002
seq 16 finished 0:23:43.982999
seq 17 finished 0:24:30.085000
seq 18 finished 0:28:04.232000
seq 19 finished 0:44:21.604000
seq 20 finished 0:58:57.852000
seq 21 finished 1:48:49.800001
seq 22 finished 1:58:34.236001


## Predictions (WW) - save prediction results - feature importance

In [18]:

# 1stmax
data_WW_byday = data_WW[["date", "conductivity", "turbidity"]].resample("1D").max()
data_WW_byday["temperature"] = data_WW["temperature"].resample("1D").mean()
data_WW_byday["pH"] = data_WW["pH"].resample("1D").mean()
data_WW_byday["ODO"] = data_WW["ODO"].resample("1D").min()
data_WW_byday["salinity_max"] = data_WW["salinity"].resample("1D").max()
data_WW_byday["salinity_min"] = data_WW["salinity"].resample("1D").min()
data_WW_byday["log10_chlorophyll"] = data_WW["log10_chlorophyll"].resample("1D").max()
data_WW_byday["chlorophyll"] = data_WW["chlorophyll"].resample("1D").max()

# # 2ndmax
# data_WW_byday = data_WW[["temperature", "pH"]].resample("1D").mean()
# data_WW_byday["date"] = data_WW["date"].resample("1D").max()
# data_WW_byday["conductivity"] = data_WW["conductivity"].resample("1D").apply(lambda x: x.nlargest(2).iloc[-1] if len(x) > 0 else np.nan)
# data_WW_byday["turbidity"] = data_WW["turbidity"].resample("1D").apply(lambda x: x.nlargest(2).iloc[-1] if len(x) > 0 else np.nan)
# data_WW_byday["ODO"] = data_WW["ODO"].resample("1D").apply(lambda x: x.nsmallest(2).iloc[-1] if len(x) > 0 else np.nan)
# data_WW_byday["salinity_max"] = data_WW["salinity"].resample("1D").apply(lambda x: x.nlargest(2).iloc[-1] if len(x) > 0 else np.nan)
# data_WW_byday["salinity_min"] = data_WW["salinity"].resample("1D").apply(lambda x: x.nsmallest(2).iloc[-1] if len(x) > 0 else np.nan)
# data_WW_byday["log10_chlorophyll"] = data_WW["log10_chlorophyll"].resample("1D").apply(lambda x: x.nlargest(2).iloc[-1] if len(x) > 0 else np.nan)
# data_WW_byday["chlorophyll"] = data_WW["chlorophyll"].resample("1D").apply(lambda x: x.nlargest(2).iloc[-1] if len(x) > 0 else np.nan)

# # 90-quantile
# data_WW_byday = data_WW[["temperature", "pH"]].resample("1D").mean()
# data_WW_byday["date"] = data_WW["date"].resample("1D").max()
# data_WW_byday["conductivity"] = data_WW["conductivity"].resample("1D").apply(lambda x: np.quantile(x, 0.9) if len(x) >= 2 else np.nan)
# data_WW_byday["turbidity"] = data_WW["turbidity"].resample("1D").apply(lambda x: np.quantile(x, 0.9) if len(x) >= 2 else np.nan)
# data_WW_byday["ODO"] = data_WW["ODO"].resample("1D").apply(lambda x: np.quantile(x, 0.1) if len(x) >= 2 else np.nan)
# data_WW_byday["salinity_max"] = data_WW["salinity"].resample("1D").apply(lambda x: np.quantile(x, 0.9) if len(x) >= 2 else np.nan)
# data_WW_byday["salinity_min"] = data_WW["salinity"].resample("1D").apply(lambda x: np.quantile(x, 0.1) if len(x) >= 2 else np.nan)
# data_WW_byday["log10_chlorophyll"] = data_WW["log10_chlorophyll"].resample("1D").apply(lambda x: np.quantile(x, 0.9) if len(x) >= 2 else np.nan)
# data_WW_byday["chlorophyll"] = data_WW["chlorophyll"].resample("1D").apply(lambda x: np.quantile(x, 0.9) if len(x) >= 2 else np.nan)


In [19]:
data_W_byday = data_W[["date"]].resample("1D").max()
#data_W_byday["airpressure"] = data_W["airpressure"].resample("1D").mean()
data_W_byday["wind"] = data_W["wind"].resample("1D").mean()

data_WW_byday = data_WW_byday.merge(data_W_byday, left_index=True, right_index=True).rename(columns = {"date_x": "date"}).drop(columns = ["date_y"])


In [20]:
data_WW_byday = data_WW_byday.dropna().resample("1D").max()

In [21]:
data_WW_byday

Unnamed: 0_level_0,date,conductivity,turbidity,temperature,pH,ODO,salinity_max,salinity_min,log10_chlorophyll,chlorophyll,wind
time_min,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2018-10-12,2018-10-12,46.978,47.06,24.194146,7.902439,5.84,30.55,28.48,1.295567,19.75,2.774167
2018-10-13,2018-10-13,46.748,29.83,22.100917,7.863437,5.89,30.41,28.41,1.480438,30.23,2.256528
2018-10-14,2018-10-14,46.615,18.17,20.642937,7.871354,6.06,30.33,28.59,1.342620,22.01,1.743472
2018-10-15,2018-10-15,46.385,15.75,21.330021,7.870104,6.32,30.16,28.72,1.330008,21.38,4.257870
2018-10-16,2018-10-16,46.521,18.81,21.417677,7.843646,5.63,30.26,28.59,1.221675,16.66,2.117500
...,...,...,...,...,...,...,...,...,...,...,...
2022-12-20,2022-12-20,49.115,5.79,5.802531,7.942708,9.53,31.56,28.73,0.429752,2.69,1.510417
2022-12-21,2022-12-21,49.668,9.33,5.474958,7.948021,9.63,31.91,29.18,0.459392,2.88,2.171071
2022-12-22,2022-12-22,49.776,50.00,5.977583,7.944375,9.74,32.03,29.50,0.563481,3.66,4.545590
2022-12-23,2022-12-23,48.842,219.66,7.366091,7.911136,9.03,31.47,28.84,0.847573,7.04,5.560694


In [22]:
def _main(data_byday, 
          is_log = True, is_cross = True, is_phase = True, horizon_forecast = 1, thres_quantile = 0.95, 
          n_seq_warmup = 6, is_mslr = True, is_smap_cv = False, list_unused_feature = None,
          #reg_method = "LinearSVR", reg_method_func = LinearSVR, kargs_reg = {"random_state": 434},
          reg_method = "LinearRegression", reg_method_func = LinearRegression, kargs_reg = {},
         ):
    
    list_features = ["salinity_max", "salinity_min", "ODO", "turbidity", "pH", "temperature", "wind"]
    if list_unused_feature is not None:
        for tmp_feature in list_unused_feature:
            list_features.remove(tmp_feature)
            
    if is_log:
        array_obs = data_byday["log10_chlorophyll"].to_numpy()
    else:
        array_obs = data_byday["chlorophyll"].to_numpy()

    array_datetime = data_byday["date"].to_numpy()
    array_cross = data_byday[list_features].to_numpy()
    
    list_split_index = list(np.where(np.isnan(array_obs))[0])
    list_split_index.append(len(array_obs))
    list_split_index.insert(0, 0)
    list_subseq, list_subseq_cross,list_subseq_datetime = [], [], []
    for tmp_index in range(len(list_split_index) - 1):
        tmp_index_left = list_split_index[tmp_index] + 1
        tmp_index_right = list_split_index[tmp_index + 1]
        if tmp_index_right - tmp_index_left < 21:
            continue
        list_subseq.append(array_obs[tmp_index_left:tmp_index_right])
        list_subseq_datetime.append(array_datetime[tmp_index_left:tmp_index_right])
        list_subseq_cross.append(array_cross[tmp_index_left:tmp_index_right])
    
    list_p_AR = [2, 1, ]
    list_p_AR_cross = [1, ]
    list_p_AR.sort(reverse = True)
    max_p = max(max(list_p_AR), max(list_p_AR_cross) if len(list_p_AR_cross) != 0 else 0)

    list_X, list_Y, list_exog = [], [], []
    list_date = []
    for tmp_subseq, tmp_subseq_datetime, tmp_subseq_cross in zip(list_subseq, list_subseq_datetime, list_subseq_cross):
        X, Y, exog = [], [], []
        tmp_list_date = []
        for tmp_index_right in range(max_p, len(tmp_subseq) - horizon_forecast + 1):

            tmp_index_test = tmp_index_right - 1 + horizon_forecast # tmp_index_right == tmp_index_test if horizon_forecast = 1

            tmp_X = []
            if is_cross:
                for tmp_p in list_p_AR_cross:
                    tmp_X.extend(tmp_subseq_cross[tmp_index_right - tmp_p])
            for tmp_p in list_p_AR:
                tmp_X.append(tmp_subseq[tmp_index_right - tmp_p])
            X.append(tmp_X)
            Y.append([tmp_subseq[tmp_index_test]])

            tmp_date = tmp_subseq_datetime[tmp_index_test]
            tmp_list_date.append(tmp_date)
            
            tmp_t = pd.Timestamp(tmp_subseq_datetime[tmp_index_test]).toordinal() - pd.Timestamp(tmp_subseq_datetime[tmp_index_test]).replace(month = 1, day = 1).toordinal()
            if is_phase:
                tmp_exog = [np.cos(tmp_t / 365 * 2 * np.pi), np.sin(tmp_t / 365 * 2 * np.pi)]
            else:
                tmp_exog = [np.cos(tmp_t / 365 * 2 * np.pi)]
            exog.append(tmp_exog)

        list_X.append(X)
        list_Y.append(Y)
        list_exog.append(exog)
        list_date.append(tmp_list_date)
    
    print("Subseqs:")
    for tmp_index in range(len(list_subseq)):
        print("\t", tmp_index, pd.Timestamp(list_subseq_datetime[tmp_index][1]).date(), pd.Timestamp(list_subseq_datetime[tmp_index][-1]).date(), len(list_subseq[tmp_index]))
    print()
    
    # Prediction
    
    start_time = datetime.datetime.now()

    list_y_true, list_y_pred_naive = [], []
    list_y_pred_mslrx, list_y_pred_mslr = [], []
    list_y_pred_mslrxsolu3 = []
    list_y_pred_smap, list_y_pred_ar = [], []
    list_y_date = []
    list_y_seq, list_y_seq_index = [], []
    for cc in range(n_seq_warmup, len(list_X)):
        
        tmp_list_y_true, tmp_list_y_pred_naive = [], []
        tmp_list_y_pred_mslrx, tmp_list_y_pred_mslr = [], []
        tmp_list_y_pred_mslrxsolu3 = []
        tmp_list_y_pred_smap, tmp_list_y_pred_ar = [], []
        tmp_list_y_date = []
        tmp_list_y_seq, tmp_list_y_seq_index = [], []
        for tmp_index_test in range(5, len(list_X[cc])):
            tmp_list_X, tmp_list_Y, tmp_list_exog = [], [], []
            tmp_list_X_test = []
            for ii in range(cc):
                tmp_list_X.append(copy.deepcopy(list_X[ii]))
                tmp_list_Y.append(copy.deepcopy(list_Y[ii]))
                tmp_list_exog.append(copy.deepcopy(list_exog[ii]))
                tmp_list_X_test.append(copy.deepcopy(list_X[ii][-1]))
            tmp_list_X.append(copy.deepcopy(list_X[cc][:tmp_index_test]))
            tmp_list_Y.append(copy.deepcopy(list_Y[cc][:tmp_index_test]))
            tmp_list_exog.append(copy.deepcopy(list_exog[cc][:tmp_index_test]))
            tmp_list_X_test.append(copy.deepcopy(list_X[cc][tmp_index_test]))
            
            tmp_y_true = list_Y[cc][tmp_index_test]
            tmp_y_pred_naive = list_X[cc][tmp_index_test][-1]
            
            if is_mslr:

                mslrx = MSLRX(n_components = 2, covariance_type="diag", n_iter = 10, is_logistic_regression_CV = False,
                              logistic_regression_C = 1e10, is_logistic_regression_standardized = False, reg_method = reg_method, kargs_reg = kargs_reg)
                tmp_pred = mslrx.fit_predict(tmp_list_X, tmp_list_Y, tmp_list_exog, tmp_list_X_test, is_multiple_sequence = True)
                tmp_y_pred = tmp_pred[-1]
                tmp_list_y_pred_mslrx.append(tmp_y_pred)

                mslrx = MSLRXSoluIII(n_components = 2, covariance_type="diag", n_iter = 10, reg_method = reg_method, kargs_reg = kargs_reg)
                tmp_pred = mslrx.fit_predict(tmp_list_X, tmp_list_Y, tmp_list_exog, tmp_list_X_test, is_multiple_sequence = True)
                tmp_y_pred = tmp_pred[-1]
                tmp_list_y_pred_mslrxsolu3.append(tmp_y_pred)

                mslr = MSLR(n_components = 2, covariance_type="diag", n_iter = 10, reg_method = reg_method, kargs_reg = kargs_reg)
                tmp_pred = mslr.fit_predict(tmp_list_X, tmp_list_Y, tmp_list_X_test, is_multiple_sequence = True)
                tmp_y_pred = tmp_pred[-1]
                tmp_list_y_pred_mslr.append(tmp_y_pred)
                
            else:
                tmp_list_y_pred_mslrx.append(0.)
                tmp_list_y_pred_mslrxsolu3.append(0.)
                tmp_list_y_pred_mslr.append(0.)
            
            lr = reg_method_func(**kargs_reg)
            lr.fit(np.concatenate(tmp_list_X), np.concatenate(tmp_list_Y).flatten())
            tmp_pred = lr.predict(tmp_list_X_test)
            tmp_y_pred = tmp_pred[-1]
            tmp_list_y_pred_ar.append(tmp_y_pred)
            
            if is_smap_cv:
                lr = SMapCV(thetas = (0.5, 1.0, 1.5, ), reg_method = reg_method, kargs_reg = kargs_reg)
            else:
                lr = SMap(theta = 0.5, reg_method = reg_method, kargs_reg = kargs_reg)
            lr.fit(np.concatenate(tmp_list_X), np.concatenate(tmp_list_Y).flatten())
            tmp_pred = lr.predict(tmp_list_X_test)
            tmp_y_pred = tmp_pred[-1]
            tmp_list_y_pred_smap.append(tmp_y_pred)

            tmp_list_y_true.append(tmp_y_true)
            tmp_list_y_pred_naive.append(tmp_y_pred_naive)
            tmp_list_y_date.append(list_date[cc][tmp_index_test])
            tmp_list_y_seq.append(cc)
            tmp_list_y_seq_index.append(tmp_index_test)
        
        list_y_date.append(tmp_list_y_date)
        list_y_seq.append(tmp_list_y_seq)
        list_y_seq_index.append(tmp_list_y_seq_index)
        if is_log:
            list_y_true.append(np.power(10, tmp_list_y_true))
            list_y_pred_naive.append(np.power(10, tmp_list_y_pred_naive))
            list_y_pred_mslrx.append(np.power(10, tmp_list_y_pred_mslrx))
            list_y_pred_mslr.append(np.power(10, tmp_list_y_pred_mslr))
            list_y_pred_mslrxsolu3.append(np.power(10, tmp_list_y_pred_mslrxsolu3))
            list_y_pred_ar.append(np.power(10, tmp_list_y_pred_ar))
            list_y_pred_smap.append(np.power(10, tmp_list_y_pred_smap))
        else:
            list_y_true.append(tmp_list_y_true)
            list_y_pred_naive.append(tmp_list_y_pred_naive)
            list_y_pred_mslrx.append(tmp_list_y_pred_mslrx)
            list_y_pred_mslr.append(tmp_list_y_pred_mslr)
            list_y_pred_mslrxsolu3.append(tmp_list_y_pred_mslrxsolu3)
            list_y_pred_ar.append(tmp_list_y_pred_ar)
            list_y_pred_smap.append(tmp_list_y_pred_smap)
        
        print("seq % s finished" % cc, str(datetime.datetime.now() - start_time))
        
    list_y_date, list_y_true, list_y_pred_naive = np.concatenate(list_y_date).flatten(), np.concatenate(list_y_true).flatten(), np.concatenate(list_y_pred_naive).flatten()
    list_y_pred_mslrx, list_y_pred_mslr, list_y_pred_mslrxsolu3, list_y_pred_ar, list_y_pred_smap = np.concatenate(list_y_pred_mslrx).flatten(), np.concatenate(list_y_pred_mslr).flatten(), np.concatenate(list_y_pred_mslrxsolu3).flatten(), np.concatenate(list_y_pred_ar).flatten(), np.concatenate(list_y_pred_smap).flatten() 
    list_y_seq, list_y_seq_index = np.concatenate(list_y_seq).flatten(), np.concatenate(list_y_seq_index).flatten()
    
    dict_method_pred = {"date": list_y_date, 
                        "true": list_y_true, 
                        "pred_naive": list_y_pred_naive, 
                        "pred_mslrx": list_y_pred_mslrx,
                        "pred_mslr": list_y_pred_mslr,
                        "pred_mslrxsolu3": list_y_pred_mslrxsolu3, 
                        "pred_ar": list_y_pred_ar,
                        "pred_smap": list_y_pred_smap,
                        "seq": list_y_seq,
                        "seq_index": list_y_seq_index,
                       }
                
    return pd.DataFrame(dict_method_pred)


In [24]:

is_log = True
is_phase = True

dict_setting_results = dict()

#for is_cross in [True, False]:
for is_cross in [True, ]:
    for horizon_forecast in [1, 3, 7]:
        for feature_removed in [None, "salinity_max", "salinity_min", "ODO", "turbidity", "pH", "temperature", "wind"]:
                
                tmp_setting = (is_log, is_cross, is_phase, horizon_forecast, "LinearRegression", )
                
                print()
                print("================================================")
                print("setting =", tmp_setting)
                print("removed feature =", feature_removed)
                print("================================================")
                
                tmp_res_df = _main(data_WW_byday, n_seq_warmup=1, 
                                  #reg_method = "LinearSVR", reg_method_func = LinearSVR, kargs_reg = {"random_state": 434},
                                  reg_method = "LinearRegression", reg_method_func = LinearRegression, kargs_reg = {},
                                  is_log = is_log, is_cross = is_cross, is_phase = is_phase, horizon_forecast = horizon_forecast, is_mslr = True,
                                   list_unused_feature = [feature_removed] if feature_removed is not None else None,
                                  )
                
                dict_setting_results[tmp_setting] = tmp_res_df
                
                tail_log = "_log" if is_log else ""
                tail_cross = "_X" if is_cross else ""
                tail_horizon = "% sD" % horizon_forecast
                tail_method = "LR"
                
                tail_removed_features = "EX" + feature_removed.replace("_", "").lower() if feature_removed is not None else "allX" 
                
                #tail_max = "2ndmax"
                tail_max = "1stmax"
                #tail_max = "90qmax"
                
                fname_write = "vimsWW_% s_pred_% s_% s_% s% s% s.csv" % (tail_max, tail_removed_features, tail_method, tail_horizon, tail_cross, tail_log, )
                tmp_res_df.to_csv(os.path.join("res", "vimsWW_featureimportance", fname_write), index = False)



setting = (True, True, True, 1, 'LinearRegression')
removed feature = None
Subseqs:
	 0 2018-11-01 2019-01-22 84
	 1 2019-02-02 2019-03-24 52
	 2 2019-03-27 2019-06-24 91
	 3 2019-06-27 2019-07-25 30
	 4 2019-08-02 2019-11-16 108
	 5 2019-11-20 2020-02-12 86
	 6 2020-02-15 2020-03-27 43
	 7 2020-03-31 2020-07-16 109
	 8 2020-09-04 2020-09-25 23
	 9 2021-02-09 2021-03-07 28
	 10 2021-04-01 2021-06-28 90
	 11 2021-07-01 2021-07-21 22
	 12 2021-07-29 2021-08-31 35
	 13 2021-09-03 2021-10-27 56
	 14 2021-10-30 2022-01-01 65
	 15 2022-01-15 2022-03-26 72
	 16 2022-03-29 2022-07-23 118
	 17 2022-07-28 2022-12-09 136

seq 1 finished 0:01:44.237309


  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 2 finished 0:05:42.286308
seq 3 finished 0:07:00.934309
seq 4 finished 0:14:10.341309
seq 5 finished 0:21:19.215309
seq 6 finished 0:25:25.734140
seq 7 finished 0:38:11.000998
seq 8 finished 0:40:17.266973
seq 9 finished 0:43:06.519341
seq 10 finished 0:55:07.113049
seq 11 finished 0:57:14.120008
seq 12 finished 1:01:11.517469
seq 13 finished 1:08:19.956818
seq 14 finished 1:17:11.778292
seq 15 finished 1:27:29.762296
seq 16 finished 1:46:52.252374
seq 17 finished 2:13:26.268967

setting = (True, True, True, 1, 'LinearRegression')
removed feature = salinity_max
Subseqs:
	 0 2018-11-01 2019-01-22 84
	 1 2019-02-02 2019-03-24 52
	 2 2019-03-27 2019-06-24 91
	 3 2019-06-27 2019-07-25 30
	 4 2019-08-02 2019-11-16 108
	 5 2019-11-20 2020-02-12 86
	 6 2020-02-15 2020-03-27 43
	 7 2020-03-31 2020-07-16 109
	 8 2020-09-04 2020-09-25 23
	 9 2021-02-09 2021-03-07 28
	 10 2021-04-01 2021-06-28 90
	 11 2021-07-01 2021-07-21 22
	 12 2021-07-29 2021-08-31 35
	 13 2021-09-03 2021-10-27 56
	 14 20

  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 2 finished 0:05:55.828527
seq 3 finished 0:07:20.492847
seq 4 finished 0:14:53.526225
seq 5 finished 0:22:30.046362
seq 6 finished 0:26:22.172199
seq 7 finished 0:38:13.321666
seq 8 finished 0:40:23.421807
seq 9 finished 0:43:24.426914
seq 10 finished 0:56:01.232976
seq 11 finished 0:58:29.251857
seq 12 finished 1:03:15.844923
seq 13 finished 1:12:01.820738
seq 14 finished 1:22:52.208365
seq 15 finished 1:35:08.461683
seq 16 finished 1:58:57.397089
seq 17 finished 2:31:22.743531

setting = (True, True, True, 1, 'LinearRegression')
removed feature = salinity_min
Subseqs:
	 0 2018-11-01 2019-01-22 84
	 1 2019-02-02 2019-03-24 52
	 2 2019-03-27 2019-06-24 91
	 3 2019-06-27 2019-07-25 30
	 4 2019-08-02 2019-11-16 108
	 5 2019-11-20 2020-02-12 86
	 6 2020-02-15 2020-03-27 43
	 7 2020-03-31 2020-07-16 109
	 8 2020-09-04 2020-09-25 23
	 9 2021-02-09 2021-03-07 28
	 10 2021-04-01 2021-06-28 90
	 11 2021-07-01 2021-07-21 22
	 12 2021-07-29 2021-08-31 35
	 13 2021-09-03 2021-10-27 56
	 14 20

  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 2 finished 0:06:26.228198
seq 3 finished 0:07:59.403962
seq 4 finished 0:16:01.043804
seq 5 finished 0:23:23.023506
seq 6 finished 0:27:27.080439
seq 7 finished 0:40:02.628953
seq 8 finished 0:42:07.722953
seq 9 finished 0:44:56.446954
seq 10 finished 0:56:58.010537
seq 11 finished 0:59:16.987868
seq 12 finished 1:03:51.483145
seq 13 finished 1:12:09.395448
seq 14 finished 1:22:37.742447
seq 15 finished 1:35:07.781447
seq 16 finished 1:58:15.784447
seq 17 finished 2:28:29.201447

setting = (True, True, True, 1, 'LinearRegression')
removed feature = ODO
Subseqs:
	 0 2018-11-01 2019-01-22 84
	 1 2019-02-02 2019-03-24 52
	 2 2019-03-27 2019-06-24 91
	 3 2019-06-27 2019-07-25 30
	 4 2019-08-02 2019-11-16 108
	 5 2019-11-20 2020-02-12 86
	 6 2020-02-15 2020-03-27 43
	 7 2020-03-31 2020-07-16 109
	 8 2020-09-04 2020-09-25 23
	 9 2021-02-09 2021-03-07 28
	 10 2021-04-01 2021-06-28 90
	 11 2021-07-01 2021-07-21 22
	 12 2021-07-29 2021-08-31 35
	 13 2021-09-03 2021-10-27 56
	 14 2021-10-30 

  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 2 finished 0:05:43.703000
seq 3 finished 0:07:06.434000
seq 4 finished 0:14:25.554000
seq 5 finished 0:21:41.785001
seq 6 finished 0:25:35.518512
seq 7 finished 0:37:37.004513
seq 8 finished 0:39:41.307511
seq 9 finished 0:42:29.942512
seq 10 finished 0:54:30.585513
seq 11 finished 0:56:51.151512
seq 12 finished 1:01:22.014511
seq 13 finished 1:09:40.611511
seq 14 finished 1:20:08.416511
seq 15 finished 1:32:39.743512
seq 16 finished 1:56:03.746511
seq 17 finished 2:26:20.084511

setting = (True, True, True, 1, 'LinearRegression')
removed feature = temperature
Subseqs:
	 0 2018-11-01 2019-01-22 84
	 1 2019-02-02 2019-03-24 52
	 2 2019-03-27 2019-06-24 91
	 3 2019-06-27 2019-07-25 30
	 4 2019-08-02 2019-11-16 108
	 5 2019-11-20 2020-02-12 86
	 6 2020-02-15 2020-03-27 43
	 7 2020-03-31 2020-07-16 109
	 8 2020-09-04 2020-09-25 23
	 9 2021-02-09 2021-03-07 28
	 10 2021-04-01 2021-06-28 90
	 11 2021-07-01 2021-07-21 22
	 12 2021-07-29 2021-08-31 35
	 13 2021-09-03 2021-10-27 56
	 14 202

  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 1 finished 0:01:23.783790
seq 2 finished 0:05:19.908212
seq 3 finished 0:06:19.046559
seq 4 finished 0:12:23.230699
seq 5 finished 0:18:32.608532
seq 6 finished 0:21:20.817757
seq 7 finished 0:31:08.813029
seq 8 finished 0:32:13.180216
seq 9 finished 0:33:52.431022
seq 10 finished 0:43:03.685970
seq 11 finished 0:44:12.454726
seq 12 finished 0:47:03.412129
seq 13 finished 0:52:49.856604
seq 14 finished 1:00:12.551183
seq 15 finished 1:08:54.082890
seq 16 finished 1:25:22.367326
seq 17 finished 1:48:15.146212

setting = (True, True, True, 7, 'LinearRegression')
removed feature = turbidity
Subseqs:
	 0 2018-11-01 2019-01-22 84
	 1 2019-02-02 2019-03-24 52
	 2 2019-03-27 2019-06-24 91
	 3 2019-06-27 2019-07-25 30
	 4 2019-08-02 2019-11-16 108
	 5 2019-11-20 2020-02-12 86
	 6 2020-02-15 2020-03-27 43
	 7 2020-03-31 2020-07-16 109
	 8 2020-09-04 2020-09-25 23
	 9 2021-02-09 2021-03-07 28
	 10 2021-04-01 2021-06-28 90
	 11 2021-07-01 2021-07-21 22
	 12 2021-07-29 2021-08-31 35
	 13 2021-

  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 1 finished 0:01:28.003038
seq 2 finished 0:05:21.408230
seq 3 finished 0:06:24.980087
seq 4 finished 0:12:36.226363
seq 5 finished 0:18:44.596265
seq 6 finished 0:21:33.120795
seq 7 finished 0:31:45.245578
seq 8 finished 0:32:51.289041
seq 9 finished 0:34:35.075365
seq 10 finished 0:44:18.921943
seq 11 finished 0:45:29.168943
seq 12 finished 0:48:29.178554
seq 13 finished 0:54:41.475639
seq 14 finished 1:02:29.540989
seq 15 finished 1:11:49.356984
seq 16 finished 1:28:14.996741
seq 17 finished 1:54:16.135413

setting = (True, True, True, 7, 'LinearRegression')
removed feature = pH
Subseqs:
	 0 2018-11-01 2019-01-22 84
	 1 2019-02-02 2019-03-24 52
	 2 2019-03-27 2019-06-24 91
	 3 2019-06-27 2019-07-25 30
	 4 2019-08-02 2019-11-16 108
	 5 2019-11-20 2020-02-12 86
	 6 2020-02-15 2020-03-27 43
	 7 2020-03-31 2020-07-16 109
	 8 2020-09-04 2020-09-25 23
	 9 2021-02-09 2021-03-07 28
	 10 2021-04-01 2021-06-28 90
	 11 2021-07-01 2021-07-21 22
	 12 2021-07-29 2021-08-31 35
	 13 2021-09-03 2

  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 1 finished 0:01:35.507001
seq 2 finished 0:06:06.435000
seq 3 finished 0:07:18.239000
seq 4 finished 0:14:05.179000
seq 5 finished 0:21:00.062000
seq 6 finished 0:24:10.578002
seq 7 finished 0:35:07.277000
seq 8 finished 0:36:20.342000
seq 9 finished 0:38:12.973001
seq 10 finished 0:48:38.277001
seq 11 finished 0:49:56.169000
seq 12 finished 0:53:11.522000
seq 13 finished 0:59:51.319001
seq 14 finished 1:08:25.520001
seq 15 finished 1:18:54.432000
seq 16 finished 1:39:14.936000
seq 17 finished 2:06:16.654000


In [None]:
def _main(data_byday, 
          is_log = True, is_cross = True, is_phase = True, horizon_forecast = 1, thres_quantile = 0.95, 
          n_seq_warmup = 6, is_mslr = True, is_smap_cv = False, list_unused_feature = None,
          #reg_method = "LinearSVR", reg_method_func = LinearSVR, kargs_reg = {"random_state": 434},
          reg_method = "LinearRegression", reg_method_func = LinearRegression, kargs_reg = {},
         ):
    
    list_features = ["salinity_max", "salinity_min", "ODO", "turbidity", "pH", "temperature", "wind"]
    if list_unused_feature is not None:
        for tmp_feature in list_unused_feature:
            list_features.remove(tmp_feature)
            
    if is_log:
        array_obs = data_byday["log10_chlorophyll"].to_numpy()
    else:
        array_obs = data_byday["chlorophyll"].to_numpy()

    array_datetime = data_byday["date"].to_numpy()
    array_cross = data_byday[list_features].to_numpy()
    
    list_split_index = list(np.where(np.isnan(array_obs))[0])
    list_split_index.append(len(array_obs))
    list_split_index.insert(0, 0)
    list_subseq, list_subseq_cross,list_subseq_datetime = [], [], []
    for tmp_index in range(len(list_split_index) - 1):
        tmp_index_left = list_split_index[tmp_index] + 1
        tmp_index_right = list_split_index[tmp_index + 1]
        if tmp_index_right - tmp_index_left < 21:
            continue
        list_subseq.append(array_obs[tmp_index_left:tmp_index_right])
        list_subseq_datetime.append(array_datetime[tmp_index_left:tmp_index_right])
        list_subseq_cross.append(array_cross[tmp_index_left:tmp_index_right])
    
    list_p_AR = [2, 1, ]
    list_p_AR_cross = [1, ]
    list_p_AR.sort(reverse = True)
    max_p = max(max(list_p_AR), max(list_p_AR_cross) if len(list_p_AR_cross) != 0 else 0)

    list_X, list_Y, list_exog = [], [], []
    list_date = []
    for tmp_subseq, tmp_subseq_datetime, tmp_subseq_cross in zip(list_subseq, list_subseq_datetime, list_subseq_cross):
        X, Y, exog = [], [], []
        tmp_list_date = []
        for tmp_index_right in range(max_p, len(tmp_subseq) - horizon_forecast + 1):

            tmp_index_test = tmp_index_right - 1 + horizon_forecast # tmp_index_right == tmp_index_test if horizon_forecast = 1

            tmp_X = []
            if is_cross:
                for tmp_p in list_p_AR_cross:
                    tmp_X.extend(tmp_subseq_cross[tmp_index_right - tmp_p])
            for tmp_p in list_p_AR:
                tmp_X.append(tmp_subseq[tmp_index_right - tmp_p])
            X.append(tmp_X)
            Y.append([tmp_subseq[tmp_index_test]])

            tmp_date = tmp_subseq_datetime[tmp_index_test]
            tmp_list_date.append(tmp_date)
            
            tmp_t = pd.Timestamp(tmp_subseq_datetime[tmp_index_test]).toordinal() - pd.Timestamp(tmp_subseq_datetime[tmp_index_test]).replace(month = 1, day = 1).toordinal()
            if is_phase:
                tmp_exog = [np.cos(tmp_t / 365 * 2 * np.pi), np.sin(tmp_t / 365 * 2 * np.pi)]
            else:
                tmp_exog = [np.cos(tmp_t / 365 * 2 * np.pi)]
            exog.append(tmp_exog)

        list_X.append(X)
        list_Y.append(Y)
        list_exog.append(exog)
        list_date.append(tmp_list_date)
    
    print("Subseqs:")
    for tmp_index in range(len(list_subseq)):
        print("\t", tmp_index, pd.Timestamp(list_subseq_datetime[tmp_index][1]).date(), pd.Timestamp(list_subseq_datetime[tmp_index][-1]).date(), len(list_subseq[tmp_index]))
    print()
    
    # Prediction
    
    start_time = datetime.datetime.now()

    list_y_true, list_y_pred_naive = [], []
    list_y_pred_mslrx, list_y_pred_mslr = [], []
    list_y_pred_mslrxsolu3 = []
    list_y_pred_smap, list_y_pred_ar = [], []
    list_y_date = []
    list_y_seq, list_y_seq_index = [], []
    for cc in range(n_seq_warmup, len(list_X)):
        
        tmp_list_y_true, tmp_list_y_pred_naive = [], []
        tmp_list_y_pred_mslrx, tmp_list_y_pred_mslr = [], []
        tmp_list_y_pred_mslrxsolu3 = []
        tmp_list_y_pred_smap, tmp_list_y_pred_ar = [], []
        tmp_list_y_date = []
        tmp_list_y_seq, tmp_list_y_seq_index = [], []
        for tmp_index_test in range(5, len(list_X[cc])):
            tmp_list_X, tmp_list_Y, tmp_list_exog = [], [], []
            tmp_list_X_test = []
            tmp_list_exog_additional = []
            for ii in range(cc):
                tmp_list_X.append(copy.deepcopy(list_X[ii]))
                tmp_list_Y.append(copy.deepcopy(list_Y[ii]))
                tmp_list_exog.append(copy.deepcopy(list_exog[ii]))
                tmp_list_X_test.append(copy.deepcopy(list_X[ii][-1]))
                if horizon_forecast != 1:
                    tmp_list_exog_additional.append(copy.deepcopy(list_exog[ii][(-horizon_forecast+1):]))
            tmp_list_X.append(copy.deepcopy(list_X[cc][:(tmp_index_test-horizon_forecast+1)]))
            tmp_list_Y.append(copy.deepcopy(list_Y[cc][:(tmp_index_test-horizon_forecast+1)]))
            tmp_list_exog.append(copy.deepcopy(list_exog[cc][:(tmp_index_test-horizon_forecast+1)]))
            tmp_list_X_test.append(copy.deepcopy(list_X[cc][tmp_index_test]))
            if horizon_forecast != 1:
                tmp_list_exog_additional.append(copy.deepcopy(list_exog[ii][(tmp_index_test-horizon_forecast+1):tmp_index_test]))
            
            tmp_y_true = list_Y[cc][tmp_index_test]
            tmp_y_pred_naive = list_X[cc][tmp_index_test][-1]
            
            if horizon_forecast ==  1:
                tmp_list_exog_additional = None
            
            if is_mslr:

                mslrx = MSLRX(n_components = 2, covariance_type="diag", n_iter = 10, is_logistic_regression_CV = False,
                              logistic_regression_C = 1e10, is_logistic_regression_standardized = False, reg_method = reg_method, kargs_reg = kargs_reg)
                tmp_pred = mslrx.fit_predict(tmp_list_X, tmp_list_Y, tmp_list_exog, tmp_list_X_test, is_multiple_sequence = True, exog_additional=tmp_list_exog_additional)
                tmp_y_pred = tmp_pred[-1]
                tmp_list_y_pred_mslrx.append(tmp_y_pred)

                mslrx = MSLRXSoluIII(n_components = 2, covariance_type="diag", n_iter = 10, reg_method = reg_method, kargs_reg = kargs_reg)
                tmp_pred = mslrx.fit_predict(tmp_list_X, tmp_list_Y, tmp_list_exog, tmp_list_X_test, is_multiple_sequence = True, exog_additional=tmp_list_exog_additional)
                tmp_y_pred = tmp_pred[-1]
                tmp_list_y_pred_mslrxsolu3.append(tmp_y_pred)

                mslr = MSLR(n_components = 2, covariance_type="diag", n_iter = 10, reg_method = reg_method, kargs_reg = kargs_reg)
                tmp_pred = mslr.fit_predict(tmp_list_X, tmp_list_Y, tmp_list_X_test, is_multiple_sequence = True, forecast_horizon=horizon_forecast)
                tmp_y_pred = tmp_pred[-1]
                tmp_list_y_pred_mslr.append(tmp_y_pred)
                
            else:
                tmp_list_y_pred_mslrx.append(0.)
                tmp_list_y_pred_mslrxsolu3.append(0.)
                tmp_list_y_pred_mslr.append(0.)
            
            lr = reg_method_func(**kargs_reg)
            lr.fit(np.concatenate(tmp_list_X), np.concatenate(tmp_list_Y).flatten())
            tmp_pred = lr.predict(tmp_list_X_test)
            tmp_y_pred = tmp_pred[-1]
            tmp_list_y_pred_ar.append(tmp_y_pred)
            
            if is_smap_cv:
                lr = SMapCV(thetas = (0.5, 1.0, 1.5, ), reg_method = reg_method, kargs_reg = kargs_reg)
            else:
                lr = SMap(theta = 0.5, reg_method = reg_method, kargs_reg = kargs_reg)
            lr.fit(np.concatenate(tmp_list_X), np.concatenate(tmp_list_Y).flatten())
            tmp_pred = lr.predict(tmp_list_X_test)
            tmp_y_pred = tmp_pred[-1]
            tmp_list_y_pred_smap.append(tmp_y_pred)

            tmp_list_y_true.append(tmp_y_true)
            tmp_list_y_pred_naive.append(tmp_y_pred_naive)
            tmp_list_y_date.append(list_date[cc][tmp_index_test])
            tmp_list_y_seq.append(cc)
            tmp_list_y_seq_index.append(tmp_index_test)
        
        list_y_date.append(tmp_list_y_date)
        list_y_seq.append(tmp_list_y_seq)
        list_y_seq_index.append(tmp_list_y_seq_index)
        if is_log:
            list_y_true.append(np.power(10, tmp_list_y_true))
            list_y_pred_naive.append(np.power(10, tmp_list_y_pred_naive))
            list_y_pred_mslrx.append(np.power(10, tmp_list_y_pred_mslrx))
            list_y_pred_mslr.append(np.power(10, tmp_list_y_pred_mslr))
            list_y_pred_mslrxsolu3.append(np.power(10, tmp_list_y_pred_mslrxsolu3))
            list_y_pred_ar.append(np.power(10, tmp_list_y_pred_ar))
            list_y_pred_smap.append(np.power(10, tmp_list_y_pred_smap))
        else:
            list_y_true.append(tmp_list_y_true)
            list_y_pred_naive.append(tmp_list_y_pred_naive)
            list_y_pred_mslrx.append(tmp_list_y_pred_mslrx)
            list_y_pred_mslr.append(tmp_list_y_pred_mslr)
            list_y_pred_mslrxsolu3.append(tmp_list_y_pred_mslrxsolu3)
            list_y_pred_ar.append(tmp_list_y_pred_ar)
            list_y_pred_smap.append(tmp_list_y_pred_smap)
        
        print("seq % s finished" % cc, str(datetime.datetime.now() - start_time))
        
    list_y_date, list_y_true, list_y_pred_naive = np.concatenate(list_y_date).flatten(), np.concatenate(list_y_true).flatten(), np.concatenate(list_y_pred_naive).flatten()
    list_y_pred_mslrx, list_y_pred_mslr, list_y_pred_mslrxsolu3, list_y_pred_ar, list_y_pred_smap = np.concatenate(list_y_pred_mslrx).flatten(), np.concatenate(list_y_pred_mslr).flatten(), np.concatenate(list_y_pred_mslrxsolu3).flatten(), np.concatenate(list_y_pred_ar).flatten(), np.concatenate(list_y_pred_smap).flatten() 
    list_y_seq, list_y_seq_index = np.concatenate(list_y_seq).flatten(), np.concatenate(list_y_seq_index).flatten()
    
    dict_method_pred = {"date": list_y_date, 
                        "true": list_y_true, 
                        "pred_naive": list_y_pred_naive, 
                        "pred_mslrx": list_y_pred_mslrx,
                        "pred_mslr": list_y_pred_mslr,
                        "pred_mslrxsolu3": list_y_pred_mslrxsolu3, 
                        "pred_ar": list_y_pred_ar,
                        "pred_smap": list_y_pred_smap,
                        "seq": list_y_seq,
                        "seq_index": list_y_seq_index,
                       }
                
    return pd.DataFrame(dict_method_pred)


In [None]:

is_log = True
is_phase = True

dict_setting_results = dict()

#for is_cross in [True, False]:
for is_cross in [True, ]:
    for horizon_forecast in [3, 7]:
        #for feature_removed in [None, "salinity_max", "salinity_min", "ODO", "turbidity", "pH", "temperature", "wind"]:
        for feature_removed in [None,]:
                
                tmp_setting = (is_log, is_cross, is_phase, horizon_forecast, "LinearRegression", )
                
                print()
                print("================================================")
                print("setting =", tmp_setting)
                print("removed feature =", feature_removed)
                print("================================================")
                
                tmp_res_df = _main(data_WW_byday, n_seq_warmup=1, 
                                  #reg_method = "LinearSVR", reg_method_func = LinearSVR, kargs_reg = {"random_state": 434},
                                  reg_method = "LinearRegression", reg_method_func = LinearRegression, kargs_reg = {},
                                  is_log = is_log, is_cross = is_cross, is_phase = is_phase, horizon_forecast = horizon_forecast, is_mslr = True,
                                   list_unused_feature = [feature_removed] if feature_removed is not None else None,
                                  )
                
                dict_setting_results[tmp_setting] = tmp_res_df
                
                tail_log = "_log" if is_log else ""
                tail_cross = "_X" if is_cross else ""
                tail_horizon = "% sD" % horizon_forecast
                tail_method = "LR"
                
                tail_removed_features = "EX" + feature_removed.replace("_", "").lower() if feature_removed is not None else "allX" 
                
                #tail_max = "2ndmax"
                tail_max = "1stmax"
                #tail_max = "90qmax"
                
                fname_write = "vimsWW_% s_pred_% s_% s_% s% s% s.csv" % (tail_max, tail_removed_features, tail_method, tail_horizon, tail_cross, tail_log, )
                tmp_res_df.to_csv(os.path.join("res", "vimsWW_pred_truehorizon", fname_write), index = False)


In [32]:
def _main(data_byday, 
          is_log = True, is_cross = True, is_phase = True, horizon_forecast = 1, thres_quantile = 0.95, 
          n_seq_warmup = 6, is_mslr = True, is_smap_cv = False, list_unused_feature = None,
          #reg_method = "LinearSVR", reg_method_func = LinearSVR, kargs_reg = {"random_state": 434},
          reg_method = "LinearRegression", reg_method_func = LinearRegression, kargs_reg = {},
         ):
    
    list_features = ["salinity_max", "salinity_min", "ODO", "turbidity", "pH", "temperature", "wind"]
    if list_unused_feature is not None:
        for tmp_feature in list_unused_feature:
            list_features.remove(tmp_feature)
            
    if is_log:
        array_obs = data_byday["log10_chlorophyll"].to_numpy()
    else:
        array_obs = data_byday["chlorophyll"].to_numpy()

    array_datetime = data_byday["date"].to_numpy()
    array_cross = data_byday[list_features].to_numpy()
    
    list_split_index = list(np.where(np.isnan(array_obs))[0])
    list_split_index.append(len(array_obs))
    list_split_index.insert(0, 0)
    list_subseq, list_subseq_cross,list_subseq_datetime = [], [], []
    for tmp_index in range(len(list_split_index) - 1):
        tmp_index_left = list_split_index[tmp_index] + 1
        tmp_index_right = list_split_index[tmp_index + 1]
        if tmp_index_right - tmp_index_left < 21:
            continue
        list_subseq.append(array_obs[tmp_index_left:tmp_index_right])
        list_subseq_datetime.append(array_datetime[tmp_index_left:tmp_index_right])
        list_subseq_cross.append(array_cross[tmp_index_left:tmp_index_right])
    
    list_p_AR = [5, 4, 3, 2, 1, ]
    list_p_AR_cross = [3, 2, 1, ]
    list_p_AR.sort(reverse = True)
    max_p = max(max(list_p_AR), max(list_p_AR_cross) if len(list_p_AR_cross) != 0 else 0)

    list_X, list_Y, list_exog = [], [], []
    list_date = []
    for tmp_subseq, tmp_subseq_datetime, tmp_subseq_cross in zip(list_subseq, list_subseq_datetime, list_subseq_cross):
        X, Y, exog = [], [], []
        tmp_list_date = []
        for tmp_index_right in range(max_p, len(tmp_subseq) - horizon_forecast + 1):

            tmp_index_test = tmp_index_right - 1 + horizon_forecast # tmp_index_right == tmp_index_test if horizon_forecast = 1

            tmp_X = []
            if is_cross:
                for tmp_p in list_p_AR_cross:
                    tmp_X.extend(tmp_subseq_cross[tmp_index_right - tmp_p])
            for tmp_p in list_p_AR:
                tmp_X.append(tmp_subseq[tmp_index_right - tmp_p])
            X.append(tmp_X)
            Y.append([tmp_subseq[tmp_index_test]])

            tmp_date = tmp_subseq_datetime[tmp_index_test]
            tmp_list_date.append(tmp_date)
            
            tmp_t = pd.Timestamp(tmp_subseq_datetime[tmp_index_test]).toordinal() - pd.Timestamp(tmp_subseq_datetime[tmp_index_test]).replace(month = 1, day = 1).toordinal()
            if is_phase:
                tmp_exog = [np.cos(tmp_t / 365 * 2 * np.pi), np.sin(tmp_t / 365 * 2 * np.pi)]
            else:
                tmp_exog = [np.cos(tmp_t / 365 * 2 * np.pi)]
            exog.append(tmp_exog)

        list_X.append(X)
        list_Y.append(Y)
        list_exog.append(exog)
        list_date.append(tmp_list_date)
    
    print("Subseqs:")
    for tmp_index in range(len(list_subseq)):
        print("\t", tmp_index, pd.Timestamp(list_subseq_datetime[tmp_index][1]).date(), pd.Timestamp(list_subseq_datetime[tmp_index][-1]).date(), len(list_subseq[tmp_index]))
    print()
    
    # Prediction
    
    start_time = datetime.datetime.now()

    list_y_true, list_y_pred_naive = [], []
    list_y_pred_mslrx, list_y_pred_mslr = [], []
    list_y_pred_mslrxsolu3 = []
    list_y_pred_smap, list_y_pred_ar = [], []
    list_y_date = []
    list_y_seq, list_y_seq_index = [], []
    for cc in range(n_seq_warmup, len(list_X)):
        
        tmp_list_y_true, tmp_list_y_pred_naive = [], []
        tmp_list_y_pred_mslrx, tmp_list_y_pred_mslr = [], []
        tmp_list_y_pred_mslrxsolu3 = []
        tmp_list_y_pred_smap, tmp_list_y_pred_ar = [], []
        tmp_list_y_date = []
        tmp_list_y_seq, tmp_list_y_seq_index = [], []
        for tmp_index_test in range(5, len(list_X[cc])):
            tmp_list_X, tmp_list_Y, tmp_list_exog = [], [], []
            tmp_list_X_test = []
            tmp_list_exog_additional = []
            for ii in range(cc):
                tmp_list_X.append(copy.deepcopy(list_X[ii]))
                tmp_list_Y.append(copy.deepcopy(list_Y[ii]))
                tmp_list_exog.append(copy.deepcopy(list_exog[ii]))
                tmp_list_X_test.append(copy.deepcopy(list_X[ii][-1]))
                if horizon_forecast != 1:
                    tmp_list_exog_additional.append(copy.deepcopy(list_exog[ii][(-horizon_forecast+1):]))
            tmp_list_X.append(copy.deepcopy(list_X[cc][:(tmp_index_test-horizon_forecast+1)]))
            tmp_list_Y.append(copy.deepcopy(list_Y[cc][:(tmp_index_test-horizon_forecast+1)]))
            tmp_list_exog.append(copy.deepcopy(list_exog[cc][:(tmp_index_test-horizon_forecast+1)]))
            tmp_list_X_test.append(copy.deepcopy(list_X[cc][tmp_index_test]))
            if horizon_forecast != 1:
                tmp_list_exog_additional.append(copy.deepcopy(list_exog[ii][(tmp_index_test-horizon_forecast+1):tmp_index_test]))
            
            tmp_y_true = list_Y[cc][tmp_index_test]
            tmp_y_pred_naive = list_X[cc][tmp_index_test][-1]
            
            if horizon_forecast ==  1:
                tmp_list_exog_additional = None
            
            if is_mslr:

                mslrx = MSLRX(n_components = 2, covariance_type="diag", n_iter = 10, is_logistic_regression_CV = False,
                              logistic_regression_C = 1e10, is_logistic_regression_standardized = False, reg_method = reg_method, kargs_reg = kargs_reg)
                tmp_pred = mslrx.fit_predict(tmp_list_X, tmp_list_Y, tmp_list_exog, tmp_list_X_test, is_multiple_sequence = True, exog_additional=tmp_list_exog_additional)
                tmp_y_pred = tmp_pred[-1]
                tmp_list_y_pred_mslrx.append(tmp_y_pred)

                mslrx = MSLRXSoluIII(n_components = 2, covariance_type="diag", n_iter = 10, reg_method = reg_method, kargs_reg = kargs_reg)
                tmp_pred = mslrx.fit_predict(tmp_list_X, tmp_list_Y, tmp_list_exog, tmp_list_X_test, is_multiple_sequence = True, exog_additional=tmp_list_exog_additional)
                tmp_y_pred = tmp_pred[-1]
                tmp_list_y_pred_mslrxsolu3.append(tmp_y_pred)

                mslr = MSLR(n_components = 2, covariance_type="diag", n_iter = 10, reg_method = reg_method, kargs_reg = kargs_reg)
                tmp_pred = mslr.fit_predict(tmp_list_X, tmp_list_Y, tmp_list_X_test, is_multiple_sequence = True, forecast_horizon=horizon_forecast)
                tmp_y_pred = tmp_pred[-1]
                tmp_list_y_pred_mslr.append(tmp_y_pred)
                
            else:
                tmp_list_y_pred_mslrx.append(0.)
                tmp_list_y_pred_mslrxsolu3.append(0.)
                tmp_list_y_pred_mslr.append(0.)
            
            lr = reg_method_func(**kargs_reg)
            lr.fit(np.concatenate(tmp_list_X), np.concatenate(tmp_list_Y).flatten())
            tmp_pred = lr.predict(tmp_list_X_test)
            tmp_y_pred = tmp_pred[-1]
            tmp_list_y_pred_ar.append(tmp_y_pred)
            
            if is_smap_cv:
                lr = SMapCV(thetas = (0.5, 1.0, 1.5, ), reg_method = reg_method, kargs_reg = kargs_reg)
            else:
                lr = SMap(theta = 0.5, reg_method = reg_method, kargs_reg = kargs_reg)
            lr.fit(np.concatenate(tmp_list_X), np.concatenate(tmp_list_Y).flatten())
            tmp_pred = lr.predict(tmp_list_X_test)
            tmp_y_pred = tmp_pred[-1]
            tmp_list_y_pred_smap.append(tmp_y_pred)

            tmp_list_y_true.append(tmp_y_true)
            tmp_list_y_pred_naive.append(tmp_y_pred_naive)
            tmp_list_y_date.append(list_date[cc][tmp_index_test])
            tmp_list_y_seq.append(cc)
            tmp_list_y_seq_index.append(tmp_index_test)
        
        list_y_date.append(tmp_list_y_date)
        list_y_seq.append(tmp_list_y_seq)
        list_y_seq_index.append(tmp_list_y_seq_index)
        if is_log:
            list_y_true.append(np.power(10, tmp_list_y_true))
            list_y_pred_naive.append(np.power(10, tmp_list_y_pred_naive))
            list_y_pred_mslrx.append(np.power(10, tmp_list_y_pred_mslrx))
            list_y_pred_mslr.append(np.power(10, tmp_list_y_pred_mslr))
            list_y_pred_mslrxsolu3.append(np.power(10, tmp_list_y_pred_mslrxsolu3))
            list_y_pred_ar.append(np.power(10, tmp_list_y_pred_ar))
            list_y_pred_smap.append(np.power(10, tmp_list_y_pred_smap))
        else:
            list_y_true.append(tmp_list_y_true)
            list_y_pred_naive.append(tmp_list_y_pred_naive)
            list_y_pred_mslrx.append(tmp_list_y_pred_mslrx)
            list_y_pred_mslr.append(tmp_list_y_pred_mslr)
            list_y_pred_mslrxsolu3.append(tmp_list_y_pred_mslrxsolu3)
            list_y_pred_ar.append(tmp_list_y_pred_ar)
            list_y_pred_smap.append(tmp_list_y_pred_smap)
        
        print("seq % s finished" % cc, str(datetime.datetime.now() - start_time))
        
    list_y_date, list_y_true, list_y_pred_naive = np.concatenate(list_y_date).flatten(), np.concatenate(list_y_true).flatten(), np.concatenate(list_y_pred_naive).flatten()
    list_y_pred_mslrx, list_y_pred_mslr, list_y_pred_mslrxsolu3, list_y_pred_ar, list_y_pred_smap = np.concatenate(list_y_pred_mslrx).flatten(), np.concatenate(list_y_pred_mslr).flatten(), np.concatenate(list_y_pred_mslrxsolu3).flatten(), np.concatenate(list_y_pred_ar).flatten(), np.concatenate(list_y_pred_smap).flatten() 
    list_y_seq, list_y_seq_index = np.concatenate(list_y_seq).flatten(), np.concatenate(list_y_seq_index).flatten()
    
    dict_method_pred = {"date": list_y_date, 
                        "true": list_y_true, 
                        "pred_naive": list_y_pred_naive, 
                        "pred_mslrx": list_y_pred_mslrx,
                        "pred_mslr": list_y_pred_mslr,
                        "pred_mslrxsolu3": list_y_pred_mslrxsolu3, 
                        "pred_ar": list_y_pred_ar,
                        "pred_smap": list_y_pred_smap,
                        "seq": list_y_seq,
                        "seq_index": list_y_seq_index,
                       }
                
    return pd.DataFrame(dict_method_pred)


In [33]:

is_log = True
is_phase = True

dict_setting_results = dict()

#for is_cross in [True, False]:
for is_cross in [True, ]:
    for horizon_forecast in [1, 3, 7]:
        for feature_removed in [None, "salinity_max", "salinity_min", "ODO", "turbidity", "pH", "temperature", "wind"]:
                
                tmp_setting = (is_log, is_cross, is_phase, horizon_forecast, "LinearRegression", )
                
                print()
                print("================================================")
                print("setting =", tmp_setting)
                print("removed feature =", feature_removed)
                print("================================================")
                
                tmp_res_df = _main(data_WW_byday, n_seq_warmup=1, 
                                  #reg_method = "LinearSVR", reg_method_func = LinearSVR, kargs_reg = {"random_state": 434},
                                  reg_method = "LinearRegression", reg_method_func = LinearRegression, kargs_reg = {},
                                  is_log = is_log, is_cross = is_cross, is_phase = is_phase, horizon_forecast = horizon_forecast, is_mslr = False,
                                   list_unused_feature = [feature_removed] if feature_removed is not None else None,
                                  )
                
                dict_setting_results[tmp_setting] = tmp_res_df
                
                tail_log = "_log" if is_log else ""
                tail_cross = "_X" if is_cross else ""
                tail_horizon = "% sD" % horizon_forecast
                tail_method = "LR"
                
                tail_removed_features = "EX" + feature_removed.replace("_", "").lower() if feature_removed is not None else "allX" 
                
                #tail_max = "2ndmax"
                tail_max = "1stmax"
                #tail_max = "90qmax"
                
                fname_write = "vimsWW_% s_pred_% s_% s_% s% s% s.csv" % (tail_max, tail_removed_features, tail_method, tail_horizon, tail_cross, tail_log, )
                tmp_res_df.to_csv(os.path.join("res", "vimsWW_pred_largelag", fname_write), index = False)



setting = (True, True, True, 1, 'LinearRegression')
removed feature = None
Subseqs:
	 0 2018-11-01 2019-01-22 84
	 1 2019-02-02 2019-03-24 52
	 2 2019-03-27 2019-06-24 91
	 3 2019-06-27 2019-07-25 30
	 4 2019-08-02 2019-11-16 108
	 5 2019-11-20 2020-02-12 86
	 6 2020-02-15 2020-03-27 43
	 7 2020-03-31 2020-07-16 109
	 8 2020-09-04 2020-09-25 23
	 9 2021-02-09 2021-03-07 28
	 10 2021-04-01 2021-06-28 90
	 11 2021-07-01 2021-07-21 22
	 12 2021-07-29 2021-08-31 35
	 13 2021-09-03 2021-10-27 56
	 14 2021-10-30 2022-01-01 65
	 15 2022-01-15 2022-03-26 72
	 16 2022-03-29 2022-07-23 118
	 17 2022-07-28 2022-12-09 136

seq 1 finished 0:00:00.502968
seq 2 finished 0:00:02.265354
seq 3 finished 0:00:02.747354
seq 4 finished 0:00:06.354327
seq 5 finished 0:00:10.360492
seq 6 finished 0:00:11.935651
seq 7 finished 0:00:18.134378
seq 8 finished 0:00:18.682378
seq 9 finished 0:00:19.624412
seq 10 finished 0:00:25.975659
seq 11 finished 0:00:26.631659
seq 12 finished 0:00:28.403198
seq 13 finished 0

In [42]:
def prslr_feature_selection(X, Y, exog, n_components, covariance_type, n_iter, is_multiple_sequence = False, method = "mslrx"):
    
    S = n_components
    
    if is_multiple_sequence:
        p_X, p_Y = len(X[0][0]), len(Y[0][0])
        X = [np.array(i) for i in X]
        Y = [np.array(i) for i in Y]
    else:
        p_X, p_Y = len(X[0]), len(Y[0])
        X, Y = np.array(X), np.array(Y)
    
    if method.lower() == "mslrx":
        method_cls = MSLRX
    elif method.lower() == "mslrxsolu3":
        method_cls = MSLRXSoluIII
    elif method.lower() == "mslr":
        method_cls = MSLR
    else:
        print("Error: no method = % s" % method)
        return
        
    delta_df = p_Y * S
    
    mslrx = method_cls(n_components = n_components, covariance_type=covariance_type, n_iter = n_iter)
    if method.lower() == "mslr":
        mslrx.fit(X, Y, is_multiple_sequence = is_multiple_sequence)
    else:
        mslrx.fit(X, Y, exog, is_multiple_sequence = is_multiple_sequence)
    
    
    loglik_full = mslrx.list_loglik_[-1]
    
    list_loglik_reduced, list_pvalues = [], []
    for ii in range(p_X):
        list_used_features = [i for i in range(p_X) if i != ii]
        mslrx = method_cls(n_components = n_components, covariance_type=covariance_type, n_iter = n_iter)
        
        if is_multiple_sequence:
            X_reduced = [i[:, list_used_features] for i in X]
        else:
            X_reduced = X[:, list_used_features]
            
        if method.lower() == "mslr":
            mslrx.fit(X_reduced, Y, is_multiple_sequence = is_multiple_sequence)
        else:
            mslrx.fit(X_reduced, Y, exog, is_multiple_sequence = is_multiple_sequence)
            
        tmp_loglik_reduced = mslrx.list_loglik_[-1]
        list_loglik_reduced.append(tmp_loglik_reduced)
        
        tmp_pvalue = scipy.stats.chi2.sf(-2 * (tmp_loglik_reduced - loglik_full), delta_df)
        list_pvalues.append(tmp_pvalue)
        
    return list_loglik_reduced, list_pvalues, loglik_full
    

In [43]:
def _main(data_byday, horizon_forecast = 1, method_mslr = "mslrx"):
    
    is_cross = True
    is_log = True
    is_phase = True
    
    list_features = ["salinity_max", "salinity_min", "ODO", "turbidity", "pH", "temperature", "wind"]

    array_datetime = data_byday["date"].to_numpy()
    array_cross = data_byday[list_features].to_numpy()
    
    if is_log:
        array_obs = data_byday["log10_chlorophyll"].to_numpy()
    else:
        array_obs = data_byday["chlorophyll"].to_numpy()
    
    list_split_index = list(np.where(np.isnan(array_obs))[0])
    list_split_index.append(len(array_obs))
    list_split_index.insert(0, 0)
    list_subseq, list_subseq_cross,list_subseq_datetime = [], [], []
    for tmp_index in range(len(list_split_index) - 1):
        tmp_index_left = list_split_index[tmp_index] + 1
        tmp_index_right = list_split_index[tmp_index + 1]
        if tmp_index_right - tmp_index_left < 21:
            continue
        list_subseq.append(array_obs[tmp_index_left:tmp_index_right])
        list_subseq_datetime.append(array_datetime[tmp_index_left:tmp_index_right])
        list_subseq_cross.append(array_cross[tmp_index_left:tmp_index_right])
    
    list_p_AR = [2, 1, ]
    list_p_AR_cross = [1, ]
    list_p_AR.sort(reverse = True)
    max_p = max(max(list_p_AR), max(list_p_AR_cross) if len(list_p_AR_cross) != 0 else 0)

    list_X, list_Y, list_exog = [], [], []
    list_date = []
    for tmp_subseq, tmp_subseq_datetime, tmp_subseq_cross in zip(list_subseq, list_subseq_datetime, list_subseq_cross):
        X, Y, exog = [], [], []
        tmp_list_date = []
        for tmp_index_right in range(max_p, len(tmp_subseq) - horizon_forecast + 1):

            tmp_index_test = tmp_index_right - 1 + horizon_forecast # tmp_index_right == tmp_index_test if horizon_forecast = 1

            tmp_X = []
            if is_cross:
                for tmp_p in list_p_AR_cross:
                    tmp_X.extend(tmp_subseq_cross[tmp_index_right - tmp_p])
            for tmp_p in list_p_AR:
                tmp_X.append(tmp_subseq[tmp_index_right - tmp_p])
            X.append(tmp_X)
            Y.append([tmp_subseq[tmp_index_test]])

            tmp_date = tmp_subseq_datetime[tmp_index_test]
            tmp_list_date.append(tmp_date)
            
            tmp_t = pd.Timestamp(tmp_subseq_datetime[tmp_index_test]).toordinal() - pd.Timestamp(tmp_subseq_datetime[tmp_index_test]).replace(month = 1, day = 1).toordinal()
            if is_phase:
                tmp_exog = [np.cos(tmp_t / 365 * 2 * np.pi), np.sin(tmp_t / 365 * 2 * np.pi)]
            else:
                tmp_exog = [np.cos(tmp_t / 365 * 2 * np.pi)]
            exog.append(tmp_exog)

        list_X.append(X)
        list_Y.append(Y)
        list_exog.append(exog)
        list_date.append(tmp_list_date)
    
    print("Subseqs:")
    for tmp_index in range(len(list_subseq)):
        print("\t", tmp_index, pd.Timestamp(list_subseq_datetime[tmp_index][1]).date(), pd.Timestamp(list_subseq_datetime[tmp_index][-1]).date(), len(list_subseq[tmp_index]))
    print()
    
    list_loglik_reduced, list_pvalues, loglik_full = prslr_feature_selection(list_X, list_Y, list_exog, is_multiple_sequence = True, 
                                                                             n_components = 2, covariance_type="diag", n_iter = 10, 
                                                                             method = method_mslr)
                
    return list_loglik_reduced, list_pvalues, loglik_full


In [44]:

dict_col_val = dict()

for horizon_forecast in [1, 3, 7]:
    for method_mslr in ["mslrx", "mslrxsolu3", "mslr"]:

        list_loglik_reduced, list_pvalues, loglik_full = _main(data_WW_byday, horizon_forecast = horizon_forecast, method_mslr = method_mslr)

        dict_col_val[(horizon_forecast, method_mslr)] = list_pvalues

tmp_res_df = pd.DataFrame(dict_col_val, index = ["salinity_max", "salinity_min", "ODO", "turbidity", "pH", "temperature", "wind", "AR2", "AR1"])

tmp_res_df.to_csv(r"res/res_vimsww_% s_insample_feature_pvalues.csv" % tail_max)


Subseqs:
	 0 2018-11-01 2019-01-22 84
	 1 2019-02-02 2019-03-24 52
	 2 2019-03-27 2019-06-24 91
	 3 2019-06-27 2019-07-25 30
	 4 2019-08-02 2019-11-16 108
	 5 2019-11-20 2020-02-12 86
	 6 2020-02-15 2020-03-27 43
	 7 2020-03-31 2020-07-16 109
	 8 2020-09-04 2020-09-25 23
	 9 2021-02-09 2021-03-07 28
	 10 2021-04-01 2021-06-28 90
	 11 2021-07-01 2021-07-21 22
	 12 2021-07-29 2021-08-31 35
	 13 2021-09-03 2021-10-27 56
	 14 2021-10-30 2022-01-01 65
	 15 2022-01-15 2022-03-26 72
	 16 2022-03-29 2022-07-23 118
	 17 2022-07-28 2022-12-09 136

Subseqs:
	 0 2018-11-01 2019-01-22 84
	 1 2019-02-02 2019-03-24 52
	 2 2019-03-27 2019-06-24 91
	 3 2019-06-27 2019-07-25 30
	 4 2019-08-02 2019-11-16 108
	 5 2019-11-20 2020-02-12 86
	 6 2020-02-15 2020-03-27 43
	 7 2020-03-31 2020-07-16 109
	 8 2020-09-04 2020-09-25 23
	 9 2021-02-09 2021-03-07 28
	 10 2021-04-01 2021-06-28 90
	 11 2021-07-01 2021-07-21 22
	 12 2021-07-29 2021-08-31 35
	 13 2021-09-03 2021-10-27 56
	 14 2021-10-30 2022-01-01 65
	 15 

In [46]:
def prslr_testing_prs(X, Y, exog, n_components, covariance_type, n_iter, is_multiple_sequence = False, method = "mslrx"):
    
    S = n_components
    
    if method.lower() == "mslrx":
        method_cls = MSLRX
    elif method.lower() == "mslrxsolu3":
        method_cls = MSLRXSoluIII
    else:
        print("Error: no method = % s" % method)
        return
        
    delta_df = 2 * S * (S - 1)
    
    mslrx = method_cls(n_components = n_components, covariance_type=covariance_type, n_iter = n_iter)
    mslrx.fit(X, Y, exog, is_multiple_sequence = is_multiple_sequence)
    loglik_prslr = mslrx.list_loglik_[-1]
    
    mslr = MSLR(n_components = n_components, covariance_type=covariance_type, n_iter = n_iter)
    mslr.fit(X, Y, is_multiple_sequence = is_multiple_sequence)
    loglik_rslr = mslr.list_loglik_[-1]
    
    pvalue = scipy.stats.chi2.sf(-2 * (loglik_rslr - loglik_prslr), delta_df)
    
    return pvalue, loglik_prslr, loglik_rslr
    

In [47]:
def _main(data_byday, horizon_forecast = 1, method_mslr = "mslrx"):
    
    is_cross = True
    is_log = True
    is_phase = True
    
    list_features = ["salinity_max", "salinity_min", "ODO", "turbidity", "pH", "temperature", "wind"]

    array_datetime = data_byday["date"].to_numpy()
    array_cross = data_byday[list_features].to_numpy()
    
    if is_log:
        array_obs = data_byday["log10_chlorophyll"].to_numpy()
    else:
        array_obs = data_byday["chlorophyll"].to_numpy()
    
    list_split_index = list(np.where(np.isnan(array_obs))[0])
    list_split_index.append(len(array_obs))
    list_split_index.insert(0, 0)
    list_subseq, list_subseq_cross,list_subseq_datetime = [], [], []
    for tmp_index in range(len(list_split_index) - 1):
        tmp_index_left = list_split_index[tmp_index] + 1
        tmp_index_right = list_split_index[tmp_index + 1]
        if tmp_index_right - tmp_index_left < 21:
            continue
        list_subseq.append(array_obs[tmp_index_left:tmp_index_right])
        list_subseq_datetime.append(array_datetime[tmp_index_left:tmp_index_right])
        list_subseq_cross.append(array_cross[tmp_index_left:tmp_index_right])
    
    list_p_AR = [2, 1, ]
    list_p_AR_cross = [1, ]
    list_p_AR.sort(reverse = True)
    max_p = max(max(list_p_AR), max(list_p_AR_cross) if len(list_p_AR_cross) != 0 else 0)

    list_X, list_Y, list_exog = [], [], []
    list_date = []
    for tmp_subseq, tmp_subseq_datetime, tmp_subseq_cross in zip(list_subseq, list_subseq_datetime, list_subseq_cross):
        X, Y, exog = [], [], []
        tmp_list_date = []
        for tmp_index_right in range(max_p, len(tmp_subseq) - horizon_forecast + 1):

            tmp_index_test = tmp_index_right - 1 + horizon_forecast # tmp_index_right == tmp_index_test if horizon_forecast = 1

            tmp_X = []
            if is_cross:
                for tmp_p in list_p_AR_cross:
                    tmp_X.extend(tmp_subseq_cross[tmp_index_right - tmp_p])
            for tmp_p in list_p_AR:
                tmp_X.append(tmp_subseq[tmp_index_right - tmp_p])
            X.append(tmp_X)
            Y.append([tmp_subseq[tmp_index_test]])

            tmp_date = tmp_subseq_datetime[tmp_index_test]
            tmp_list_date.append(tmp_date)
            
            tmp_t = pd.Timestamp(tmp_subseq_datetime[tmp_index_test]).toordinal() - pd.Timestamp(tmp_subseq_datetime[tmp_index_test]).replace(month = 1, day = 1).toordinal()
            if is_phase:
                tmp_exog = [np.cos(tmp_t / 365 * 2 * np.pi), np.sin(tmp_t / 365 * 2 * np.pi)]
            else:
                tmp_exog = [np.cos(tmp_t / 365 * 2 * np.pi)]
            exog.append(tmp_exog)

        list_X.append(X)
        list_Y.append(Y)
        list_exog.append(exog)
        list_date.append(tmp_list_date)
    
    print("Subseqs:")
    for tmp_index in range(len(list_subseq)):
        print("\t", tmp_index, pd.Timestamp(list_subseq_datetime[tmp_index][1]).date(), pd.Timestamp(list_subseq_datetime[tmp_index][-1]).date(), len(list_subseq[tmp_index]))
    print()
    
    pvalue, loglik_prslr, loglik_rslr = prslr_testing_prs(list_X, list_Y, list_exog, n_components = 2, covariance_type="diag", n_iter = 10, is_multiple_sequence = True, method = method_mslr)
                
    return pvalue, loglik_prslr, loglik_rslr


In [48]:

dict_col_val = dict()

for horizon_forecast in [1, 3, 7]:
    for method_mslr in ["mslrx", "mslrxsolu3", ]:

        pvalue, loglik_prslr, loglik_rslr = _main(data_WW_byday, horizon_forecast = horizon_forecast, method_mslr = method_mslr)

        dict_col_val[(horizon_forecast, method_mslr)] = [pvalue]

tmp_res_df = pd.DataFrame(dict_col_val)

tmp_res_df.to_csv(r"res/res_vimsww_% s_insample_prs_pvalues.csv" % tail_max)


Subseqs:
	 0 2018-11-01 2019-01-22 84
	 1 2019-02-02 2019-03-24 52
	 2 2019-03-27 2019-06-24 91
	 3 2019-06-27 2019-07-25 30
	 4 2019-08-02 2019-11-16 108
	 5 2019-11-20 2020-02-12 86
	 6 2020-02-15 2020-03-27 43
	 7 2020-03-31 2020-07-16 109
	 8 2020-09-04 2020-09-25 23
	 9 2021-02-09 2021-03-07 28
	 10 2021-04-01 2021-06-28 90
	 11 2021-07-01 2021-07-21 22
	 12 2021-07-29 2021-08-31 35
	 13 2021-09-03 2021-10-27 56
	 14 2021-10-30 2022-01-01 65
	 15 2022-01-15 2022-03-26 72
	 16 2022-03-29 2022-07-23 118
	 17 2022-07-28 2022-12-09 136

Subseqs:
	 0 2018-11-01 2019-01-22 84
	 1 2019-02-02 2019-03-24 52
	 2 2019-03-27 2019-06-24 91
	 3 2019-06-27 2019-07-25 30
	 4 2019-08-02 2019-11-16 108
	 5 2019-11-20 2020-02-12 86
	 6 2020-02-15 2020-03-27 43
	 7 2020-03-31 2020-07-16 109
	 8 2020-09-04 2020-09-25 23
	 9 2021-02-09 2021-03-07 28
	 10 2021-04-01 2021-06-28 90
	 11 2021-07-01 2021-07-21 22
	 12 2021-07-29 2021-08-31 35
	 13 2021-09-03 2021-10-27 56
	 14 2021-10-30 2022-01-01 65
	 15 

In [57]:
def _main(data_byday, 
          is_log = True, is_cross = True, is_phase = True, horizon_forecast = 1, thres_quantile = 0.95, 
          n_seq_warmup = 6, is_mslr = True, is_smap_cv = False, list_unused_feature = None,
          #reg_method = "LinearSVR", reg_method_func = LinearSVR, kargs_reg = {"random_state": 434},
          reg_method = "LinearRegression", reg_method_func = LinearRegression, kargs_reg = {},
         ):
    
    list_features = ["salinity_max", "salinity_min", "ODO", "turbidity", "pH", "temperature", "wind"]
    if list_unused_feature is not None:
        for tmp_feature in list_unused_feature:
            list_features.remove(tmp_feature)
            
    if is_log:
        array_obs = data_byday["log10_chlorophyll"].to_numpy()
    else:
        array_obs = data_byday["chlorophyll"].to_numpy()

    array_datetime = data_byday["date"].to_numpy()
    array_cross = data_byday[list_features].to_numpy()
    
    list_split_index = list(np.where(np.isnan(array_obs))[0])
    list_split_index.append(len(array_obs))
    list_split_index.insert(0, 0)
    list_subseq, list_subseq_cross,list_subseq_datetime = [], [], []
    for tmp_index in range(len(list_split_index) - 1):
        tmp_index_left = list_split_index[tmp_index] + 1
        tmp_index_right = list_split_index[tmp_index + 1]
        if tmp_index_right - tmp_index_left < 21:
            continue
        list_subseq.append(array_obs[tmp_index_left:tmp_index_right])
        list_subseq_datetime.append(array_datetime[tmp_index_left:tmp_index_right])
        list_subseq_cross.append(array_cross[tmp_index_left:tmp_index_right])
    
    list_p_AR = [2, 1, ]
    list_p_AR_cross = [1, ]
    list_p_AR.sort(reverse = True)
    max_p = max(max(list_p_AR), max(list_p_AR_cross) if len(list_p_AR_cross) != 0 else 0)

    list_X, list_Y, list_exog = [], [], []
    list_date = []
    for tmp_subseq, tmp_subseq_datetime, tmp_subseq_cross in zip(list_subseq, list_subseq_datetime, list_subseq_cross):
        X, Y, exog = [], [], []
        tmp_list_date = []
        for tmp_index_right in range(max_p, len(tmp_subseq) - horizon_forecast + 1):

            tmp_index_test = tmp_index_right - 1 + horizon_forecast # tmp_index_right == tmp_index_test if horizon_forecast = 1

            tmp_X = []
            if is_cross:
                for tmp_p in list_p_AR_cross:
                    tmp_X.extend(tmp_subseq_cross[tmp_index_right - tmp_p])
            for tmp_p in list_p_AR:
                tmp_X.append(tmp_subseq[tmp_index_right - tmp_p])
            X.append(tmp_X)
            Y.append([tmp_subseq[tmp_index_test]])

            tmp_date = tmp_subseq_datetime[tmp_index_test]
            tmp_list_date.append(tmp_date)
            
            tmp_t = pd.Timestamp(tmp_subseq_datetime[tmp_index_test]).toordinal() - pd.Timestamp(tmp_subseq_datetime[tmp_index_test]).replace(month = 1, day = 1).toordinal()
            if is_phase:
                tmp_exog = [np.cos(tmp_t / 365 * 2 * np.pi), np.sin(tmp_t / 365 * 2 * np.pi)]
            else:
                tmp_exog = [np.cos(tmp_t / 365 * 2 * np.pi)]
            exog.append(tmp_exog)

        list_X.append(X)
        list_Y.append(Y)
        list_exog.append(exog)
        list_date.append(tmp_list_date)
    
    print("Subseqs:")
    for tmp_index in range(len(list_subseq)):
        print("\t", tmp_index, pd.Timestamp(list_subseq_datetime[tmp_index][1]).date(), pd.Timestamp(list_subseq_datetime[tmp_index][-1]).date(), len(list_subseq[tmp_index]))
    print()
    
    # Prediction
    
    start_time = datetime.datetime.now()

    list_y_true, list_y_pred_naive = [], []
    list_y_pred_mslrx, list_y_pred_mslr = [], []
    list_y_pred_mslrxsolu3 = []
    list_y_pred_smap, list_y_pred_ar = [], []
    list_y_date = []
    list_y_seq, list_y_seq_index = [], []
    for cc in range(n_seq_warmup, len(list_X)):
        
        tmp_list_y_true, tmp_list_y_pred_naive = [], []
        tmp_list_y_pred_mslrx, tmp_list_y_pred_mslr = [], []
        tmp_list_y_pred_mslrxsolu3 = []
        tmp_list_y_pred_smap, tmp_list_y_pred_ar = [], []
        tmp_list_y_date = []
        tmp_list_y_seq, tmp_list_y_seq_index = [], []
        for tmp_index_test in range(5, len(list_X[cc])):
            tmp_list_X, tmp_list_Y, tmp_list_exog = [], [], []
            tmp_list_X_test = []
            for ii in range(cc):
                tmp_list_X.append(copy.deepcopy(list_X[ii]))
                tmp_list_Y.append(copy.deepcopy(list_Y[ii]))
                tmp_list_exog.append(copy.deepcopy(list_exog[ii]))
                tmp_list_X_test.append(copy.deepcopy(list_X[ii][-1]))
            tmp_list_X.append(copy.deepcopy(list_X[cc][:tmp_index_test]))
            tmp_list_Y.append(copy.deepcopy(list_Y[cc][:tmp_index_test]))
            tmp_list_exog.append(copy.deepcopy(list_exog[cc][:tmp_index_test]))
            tmp_list_X_test.append(copy.deepcopy(list_X[cc][tmp_index_test]))
            
            tmp_y_true = list_Y[cc][tmp_index_test]
            tmp_y_pred_naive = list_X[cc][tmp_index_test][-1]
            
            if is_mslr:

                mslrx = MSLRX(n_components = 2, covariance_type="diag", n_iter = 10, is_logistic_regression_CV = False,
                              logistic_regression_C = 1e10, is_logistic_regression_standardized = False, reg_method = reg_method, kargs_reg = kargs_reg)
                tmp_pred = mslrx.fit_predict(tmp_list_X, tmp_list_Y, tmp_list_exog, tmp_list_X_test, is_multiple_sequence = True)
                tmp_y_pred = tmp_pred[-1]
                tmp_list_y_pred_mslrx.append(tmp_y_pred)

                mslrx = MSLRXSoluIII(n_components = 2, covariance_type="diag", n_iter = 10, reg_method = reg_method, kargs_reg = kargs_reg)
                tmp_pred = mslrx.fit_predict(tmp_list_X, tmp_list_Y, tmp_list_exog, tmp_list_X_test, is_multiple_sequence = True)
                tmp_y_pred = tmp_pred[-1]
                tmp_list_y_pred_mslrxsolu3.append(tmp_y_pred)

                mslr = MSLR(n_components = 2, covariance_type="diag", n_iter = 10, reg_method = reg_method, kargs_reg = kargs_reg)
                tmp_pred = mslr.fit_predict(tmp_list_X, tmp_list_Y, tmp_list_X_test, is_multiple_sequence = True)
                tmp_y_pred = tmp_pred[-1]
                tmp_list_y_pred_mslr.append(tmp_y_pred)
                
            else:
                tmp_list_y_pred_mslrx.append(0.)
                tmp_list_y_pred_mslrxsolu3.append(0.)
                tmp_list_y_pred_mslr.append(0.)
            
            lr = reg_method_func(**kargs_reg)
            lr.fit(np.concatenate(tmp_list_X), np.concatenate(tmp_list_Y).flatten())
            tmp_pred = lr.predict(tmp_list_X_test)
            tmp_y_pred = tmp_pred[-1]
            tmp_list_y_pred_ar.append(tmp_y_pred)
            
            if is_smap_cv:
                lr = SMapCV(thetas = (0.5, 1.0, 1.5, ), reg_method = reg_method, kargs_reg = kargs_reg)
            else:
                lr = SMap(theta = 0.5, reg_method = reg_method, kargs_reg = kargs_reg)
            lr.fit(np.concatenate(tmp_list_X), np.concatenate(tmp_list_Y).flatten())
            tmp_pred = lr.predict(tmp_list_X_test)
            tmp_y_pred = tmp_pred[-1]
            tmp_list_y_pred_smap.append(tmp_y_pred)

            tmp_list_y_true.append(tmp_y_true)
            tmp_list_y_pred_naive.append(tmp_y_pred_naive)
            tmp_list_y_date.append(list_date[cc][tmp_index_test])
            tmp_list_y_seq.append(cc)
            tmp_list_y_seq_index.append(tmp_index_test)
        
        list_y_date.append(tmp_list_y_date)
        list_y_seq.append(tmp_list_y_seq)
        list_y_seq_index.append(tmp_list_y_seq_index)
        if is_log:
            list_y_true.append(np.power(10, tmp_list_y_true))
            list_y_pred_naive.append(np.power(10, tmp_list_y_pred_naive))
            list_y_pred_mslrx.append(np.power(10, tmp_list_y_pred_mslrx))
            list_y_pred_mslr.append(np.power(10, tmp_list_y_pred_mslr))
            list_y_pred_mslrxsolu3.append(np.power(10, tmp_list_y_pred_mslrxsolu3))
            list_y_pred_ar.append(np.power(10, tmp_list_y_pred_ar))
            list_y_pred_smap.append(np.power(10, tmp_list_y_pred_smap))
        else:
            list_y_true.append(tmp_list_y_true)
            list_y_pred_naive.append(tmp_list_y_pred_naive)
            list_y_pred_mslrx.append(tmp_list_y_pred_mslrx)
            list_y_pred_mslr.append(tmp_list_y_pred_mslr)
            list_y_pred_mslrxsolu3.append(tmp_list_y_pred_mslrxsolu3)
            list_y_pred_ar.append(tmp_list_y_pred_ar)
            list_y_pred_smap.append(tmp_list_y_pred_smap)
        
        print("seq % s finished" % cc, str(datetime.datetime.now() - start_time))
        
    list_y_date, list_y_true, list_y_pred_naive = np.concatenate(list_y_date).flatten(), np.concatenate(list_y_true).flatten(), np.concatenate(list_y_pred_naive).flatten()
    list_y_pred_mslrx, list_y_pred_mslr, list_y_pred_mslrxsolu3, list_y_pred_ar, list_y_pred_smap = np.concatenate(list_y_pred_mslrx).flatten(), np.concatenate(list_y_pred_mslr).flatten(), np.concatenate(list_y_pred_mslrxsolu3).flatten(), np.concatenate(list_y_pred_ar).flatten(), np.concatenate(list_y_pred_smap).flatten() 
    list_y_seq, list_y_seq_index = np.concatenate(list_y_seq).flatten(), np.concatenate(list_y_seq_index).flatten()
    
    dict_method_pred = {"date": list_y_date, 
                        "true": list_y_true, 
                        "pred_naive": list_y_pred_naive, 
                        "pred_mslrx": list_y_pred_mslrx,
                        "pred_mslr": list_y_pred_mslr,
                        "pred_mslrxsolu3": list_y_pred_mslrxsolu3, 
                        "pred_ar": list_y_pred_ar,
                        "pred_smap": list_y_pred_smap,
                        "seq": list_y_seq,
                        "seq_index": list_y_seq_index,
                       }
                
    return pd.DataFrame(dict_method_pred)


In [59]:

is_log = True
is_phase = True

dict_setting_results = dict()

#for is_cross in [True, False]:
for is_cross in [False, ]:
    for horizon_forecast in [1, 3, 7]:
        for feature_removed in [None, ]:
                
                tmp_setting = (is_log, is_cross, is_phase, horizon_forecast, "LinearRegression", )
                
                print()
                print("================================================")
                print("setting =", tmp_setting)
                print("removed feature =", feature_removed)
                print("================================================")
                
                tmp_res_df = _main(data_WW_byday, n_seq_warmup=1, 
                                  #reg_method = "LinearSVR", reg_method_func = LinearSVR, kargs_reg = {"random_state": 434},
                                  reg_method = "LinearRegression", reg_method_func = LinearRegression, kargs_reg = {},
                                  is_log = is_log, is_cross = is_cross, is_phase = is_phase, horizon_forecast = horizon_forecast, is_mslr = True,
                                   list_unused_feature = [feature_removed] if feature_removed is not None else None,
                                  )
                
                dict_setting_results[tmp_setting] = tmp_res_df
                
                tail_log = "_log" if is_log else ""
                tail_cross = "_X" if is_cross else ""
                tail_horizon = "% sD" % horizon_forecast
                tail_method = "LR"
                
                if is_cross:
                    tail_removed_features = "EX" + feature_removed.replace("_", "").lower() if feature_removed is not None else "allX" 
                else:
                    tail_removed_features = "noX"
                
                #tail_max = "2ndmax"
                tail_max = "1stmax"
                #tail_max = "90qmax"
                
                fname_write = "vimsWW_% s_pred_% s_% s_% s% s% s.csv" % (tail_max, tail_removed_features, tail_method, tail_horizon, tail_cross, tail_log, )
                tmp_res_df.to_csv(os.path.join("res", "vimsWW_nocross", fname_write), index = False)



setting = (True, False, True, 1, 'LinearRegression')
removed feature = None
Subseqs:
	 0 2018-11-01 2019-01-22 84
	 1 2019-02-02 2019-03-24 52
	 2 2019-03-27 2019-06-24 91
	 3 2019-06-27 2019-07-25 30
	 4 2019-08-02 2019-11-16 108
	 5 2019-11-20 2020-02-12 86
	 6 2020-02-15 2020-03-27 43
	 7 2020-03-31 2020-07-16 109
	 8 2020-09-04 2020-09-25 23
	 9 2021-02-09 2021-03-07 28
	 10 2021-04-01 2021-06-28 90
	 11 2021-07-01 2021-07-21 22
	 12 2021-07-29 2021-08-31 35
	 13 2021-09-03 2021-10-27 56
	 14 2021-10-30 2022-01-01 65
	 15 2022-01-15 2022-03-26 72
	 16 2022-03-29 2022-07-23 118
	 17 2022-07-28 2022-12-09 136

seq 1 finished 0:01:57.627845
seq 2 finished 0:06:30.992472
seq 3 finished 0:08:02.776334
seq 4 finished 0:16:41.397650
seq 5 finished 0:24:51.175323
seq 6 finished 0:28:51.042156
seq 7 finished 0:41:16.107723
seq 8 finished 0:43:21.999266
seq 9 finished 0:46:12.731996
seq 10 finished 0:58:20.682834
seq 11 finished 1:00:42.530710
seq 12 finished 1:05:18.581499
seq 13 finished 

In [60]:
def _main(data_byday, 
          is_log = True, is_cross = True, is_phase = True, horizon_forecast = 1, thres_quantile = 0.95, 
          n_seq_warmup = 6, is_mslr = True, is_smap_cv = False, list_unused_feature = None,
          #reg_method = "LinearSVR", reg_method_func = LinearSVR, kargs_reg = {"random_state": 434},
          reg_method = "LinearRegression", reg_method_func = LinearRegression, kargs_reg = {},
         ):
    
    list_features = ["salinity_max", "salinity_min", "ODO", "turbidity", "pH", "temperature", "wind"]
    if list_unused_feature is not None:
        for tmp_feature in list_unused_feature:
            list_features.remove(tmp_feature)
            
    if is_log:
        array_obs = data_byday["log10_chlorophyll"].to_numpy()
    else:
        array_obs = data_byday["chlorophyll"].to_numpy()

    array_datetime = data_byday["date"].to_numpy()
    array_cross = data_byday[list_features].to_numpy()
    
    list_split_index = list(np.where(np.isnan(array_obs))[0])
    list_split_index.append(len(array_obs))
    list_split_index.insert(0, 0)
    list_subseq, list_subseq_cross,list_subseq_datetime = [], [], []
    for tmp_index in range(len(list_split_index) - 1):
        tmp_index_left = list_split_index[tmp_index] + 1
        tmp_index_right = list_split_index[tmp_index + 1]
        if tmp_index_right - tmp_index_left < 21:
            continue
        list_subseq.append(array_obs[tmp_index_left:tmp_index_right])
        list_subseq_datetime.append(array_datetime[tmp_index_left:tmp_index_right])
        list_subseq_cross.append(array_cross[tmp_index_left:tmp_index_right])
    
    list_p_AR = [2, 1, ]
    list_p_AR_cross = [1, ]
    list_p_AR.sort(reverse = True)
    max_p = max(max(list_p_AR), max(list_p_AR_cross) if len(list_p_AR_cross) != 0 else 0)

    list_X, list_Y, list_exog = [], [], []
    list_date = []
    for tmp_subseq, tmp_subseq_datetime, tmp_subseq_cross in zip(list_subseq, list_subseq_datetime, list_subseq_cross):
        X, Y, exog = [], [], []
        tmp_list_date = []
        for tmp_index_right in range(max_p, len(tmp_subseq) - horizon_forecast + 1):

            tmp_index_test = tmp_index_right - 1 + horizon_forecast # tmp_index_right == tmp_index_test if horizon_forecast = 1

            tmp_X = []
            if is_cross:
                for tmp_p in list_p_AR_cross:
                    tmp_X.extend(tmp_subseq_cross[tmp_index_right - tmp_p])
            for tmp_p in list_p_AR:
                tmp_X.append(tmp_subseq[tmp_index_right - tmp_p])
            X.append(tmp_X)
            Y.append([tmp_subseq[tmp_index_test]])

            tmp_date = tmp_subseq_datetime[tmp_index_test]
            tmp_list_date.append(tmp_date)
            
            tmp_t = pd.Timestamp(tmp_subseq_datetime[tmp_index_test]).toordinal() - pd.Timestamp(tmp_subseq_datetime[tmp_index_test]).replace(month = 1, day = 1).toordinal()
            if is_phase:
                tmp_exog = [np.cos(tmp_t / 365 * 2 * np.pi), np.sin(tmp_t / 365 * 2 * np.pi)]
            else:
                tmp_exog = [np.cos(tmp_t / 365 * 2 * np.pi)]
            exog.append(tmp_exog)

        list_X.append(X)
        list_Y.append(Y)
        list_exog.append(exog)
        list_date.append(tmp_list_date)
    
    print("Subseqs:")
    for tmp_index in range(len(list_subseq)):
        print("\t", tmp_index, pd.Timestamp(list_subseq_datetime[tmp_index][1]).date(), pd.Timestamp(list_subseq_datetime[tmp_index][-1]).date(), len(list_subseq[tmp_index]))
    print()
    
    # Prediction
    
    start_time = datetime.datetime.now()

    list_y_true, list_y_pred_naive = [], []
    list_y_pred_mslrx, list_y_pred_mslr = [], []
    list_y_pred_mslrxsolu3 = []
    list_y_pred_smap, list_y_pred_ar = [], []
    list_y_date = []
    list_y_seq, list_y_seq_index = [], []
    for cc in range(n_seq_warmup, len(list_X)):
        
        tmp_list_y_true, tmp_list_y_pred_naive = [], []
        tmp_list_y_pred_mslrx, tmp_list_y_pred_mslr = [], []
        tmp_list_y_pred_mslrxsolu3 = []
        tmp_list_y_pred_smap, tmp_list_y_pred_ar = [], []
        tmp_list_y_date = []
        tmp_list_y_seq, tmp_list_y_seq_index = [], []
        for tmp_index_test in range(4 + horizon_forecast, len(list_X[cc])):
            tmp_list_X, tmp_list_Y, tmp_list_exog = [], [], []
            tmp_list_X_test = []
            tmp_list_exog_additional = []
            for ii in range(cc):
                tmp_list_X.append(copy.deepcopy(list_X[ii]))
                tmp_list_Y.append(copy.deepcopy(list_Y[ii]))
                tmp_list_exog.append(copy.deepcopy(list_exog[ii]))
                tmp_list_X_test.append(copy.deepcopy(list_X[ii][-1]))
                if horizon_forecast != 1:
                    tmp_list_exog_additional.append(copy.deepcopy(list_exog[ii][(-horizon_forecast+1):]))
            tmp_list_X.append(copy.deepcopy(list_X[cc][:(tmp_index_test-horizon_forecast+1)]))
            tmp_list_Y.append(copy.deepcopy(list_Y[cc][:(tmp_index_test-horizon_forecast+1)]))
            tmp_list_exog.append(copy.deepcopy(list_exog[cc][:(tmp_index_test-horizon_forecast+1)]))
            tmp_list_X_test.append(copy.deepcopy(list_X[cc][tmp_index_test]))
            if horizon_forecast != 1:
                tmp_list_exog_additional.append(copy.deepcopy(list_exog[ii][(tmp_index_test-horizon_forecast+1):tmp_index_test]))
            
            tmp_y_true = list_Y[cc][tmp_index_test]
            tmp_y_pred_naive = list_X[cc][tmp_index_test][-1]
            
            if horizon_forecast ==  1:
                tmp_list_exog_additional = None
            
            if is_mslr:

                mslrx = MSLRX(n_components = 2, covariance_type="diag", n_iter = 10, is_logistic_regression_CV = False,
                              logistic_regression_C = 1e10, is_logistic_regression_standardized = False, reg_method = reg_method, kargs_reg = kargs_reg)
                tmp_pred = mslrx.fit_predict(tmp_list_X, tmp_list_Y, tmp_list_exog, tmp_list_X_test, is_multiple_sequence = True, exog_additional=tmp_list_exog_additional)
                tmp_y_pred = tmp_pred[-1]
                tmp_list_y_pred_mslrx.append(tmp_y_pred)

                mslrx = MSLRXSoluIII(n_components = 2, covariance_type="diag", n_iter = 10, reg_method = reg_method, kargs_reg = kargs_reg)
                tmp_pred = mslrx.fit_predict(tmp_list_X, tmp_list_Y, tmp_list_exog, tmp_list_X_test, is_multiple_sequence = True, exog_additional=tmp_list_exog_additional)
                tmp_y_pred = tmp_pred[-1]
                tmp_list_y_pred_mslrxsolu3.append(tmp_y_pred)

                mslr = MSLR(n_components = 2, covariance_type="diag", n_iter = 10, reg_method = reg_method, kargs_reg = kargs_reg)
                tmp_pred = mslr.fit_predict(tmp_list_X, tmp_list_Y, tmp_list_X_test, is_multiple_sequence = True, forecast_horizon=horizon_forecast)
                tmp_y_pred = tmp_pred[-1]
                tmp_list_y_pred_mslr.append(tmp_y_pred)
                
            else:
                tmp_list_y_pred_mslrx.append(0.)
                tmp_list_y_pred_mslrxsolu3.append(0.)
                tmp_list_y_pred_mslr.append(0.)
            
            lr = reg_method_func(**kargs_reg)
            lr.fit(np.concatenate(tmp_list_X), np.concatenate(tmp_list_Y).flatten())
            tmp_pred = lr.predict(tmp_list_X_test)
            tmp_y_pred = tmp_pred[-1]
            tmp_list_y_pred_ar.append(tmp_y_pred)
            
            if is_smap_cv:
                lr = SMapCV(thetas = (0.5, 1.0, 1.5, ), reg_method = reg_method, kargs_reg = kargs_reg)
            else:
                lr = SMap(theta = 0.5, reg_method = reg_method, kargs_reg = kargs_reg)
            lr.fit(np.concatenate(tmp_list_X), np.concatenate(tmp_list_Y).flatten())
            tmp_pred = lr.predict(tmp_list_X_test)
            tmp_y_pred = tmp_pred[-1]
            tmp_list_y_pred_smap.append(tmp_y_pred)

            tmp_list_y_true.append(tmp_y_true)
            tmp_list_y_pred_naive.append(tmp_y_pred_naive)
            tmp_list_y_date.append(list_date[cc][tmp_index_test])
            tmp_list_y_seq.append(cc)
            tmp_list_y_seq_index.append(tmp_index_test - horizon_forecast + 1)
        
        list_y_date.append(tmp_list_y_date)
        list_y_seq.append(tmp_list_y_seq)
        list_y_seq_index.append(tmp_list_y_seq_index)
        if is_log:
            list_y_true.append(np.power(10, tmp_list_y_true))
            list_y_pred_naive.append(np.power(10, tmp_list_y_pred_naive))
            list_y_pred_mslrx.append(np.power(10, tmp_list_y_pred_mslrx))
            list_y_pred_mslr.append(np.power(10, tmp_list_y_pred_mslr))
            list_y_pred_mslrxsolu3.append(np.power(10, tmp_list_y_pred_mslrxsolu3))
            list_y_pred_ar.append(np.power(10, tmp_list_y_pred_ar))
            list_y_pred_smap.append(np.power(10, tmp_list_y_pred_smap))
        else:
            list_y_true.append(tmp_list_y_true)
            list_y_pred_naive.append(tmp_list_y_pred_naive)
            list_y_pred_mslrx.append(tmp_list_y_pred_mslrx)
            list_y_pred_mslr.append(tmp_list_y_pred_mslr)
            list_y_pred_mslrxsolu3.append(tmp_list_y_pred_mslrxsolu3)
            list_y_pred_ar.append(tmp_list_y_pred_ar)
            list_y_pred_smap.append(tmp_list_y_pred_smap)
        
        print("seq % s finished" % cc, str(datetime.datetime.now() - start_time))
        
    list_y_date, list_y_true, list_y_pred_naive = np.concatenate(list_y_date).flatten(), np.concatenate(list_y_true).flatten(), np.concatenate(list_y_pred_naive).flatten()
    list_y_pred_mslrx, list_y_pred_mslr, list_y_pred_mslrxsolu3, list_y_pred_ar, list_y_pred_smap = np.concatenate(list_y_pred_mslrx).flatten(), np.concatenate(list_y_pred_mslr).flatten(), np.concatenate(list_y_pred_mslrxsolu3).flatten(), np.concatenate(list_y_pred_ar).flatten(), np.concatenate(list_y_pred_smap).flatten() 
    list_y_seq, list_y_seq_index = np.concatenate(list_y_seq).flatten(), np.concatenate(list_y_seq_index).flatten()
    
    dict_method_pred = {"date": list_y_date, 
                        "true": list_y_true, 
                        "pred_naive": list_y_pred_naive, 
                        "pred_mslrx": list_y_pred_mslrx,
                        "pred_mslr": list_y_pred_mslr,
                        "pred_mslrxsolu3": list_y_pred_mslrxsolu3, 
                        "pred_ar": list_y_pred_ar,
                        "pred_smap": list_y_pred_smap,
                        "seq": list_y_seq,
                        "seq_index": list_y_seq_index,
                       }
                
    return pd.DataFrame(dict_method_pred)


In [62]:

is_log = True
is_phase = True

dict_setting_results = dict()

for is_cross in [False, True]:
#for is_cross in [False, ]:
    for horizon_forecast in [1]:
        for feature_removed in [None]:
                
                tmp_setting = (is_log, is_cross, is_phase, horizon_forecast, "LinearRegression", )
                
                print()
                print("================================================")
                print("setting =", tmp_setting)
                print("removed feature =", feature_removed)
                print("================================================")
                
                tmp_res_df = _main(data_WW_byday, n_seq_warmup=1, 
                                  #reg_method = "LinearSVR", reg_method_func = LinearSVR, kargs_reg = {"random_state": 434},
                                  reg_method = "LinearRegression", reg_method_func = LinearRegression, kargs_reg = {},
                                  is_log = is_log, is_cross = is_cross, is_phase = is_phase, horizon_forecast = horizon_forecast, is_mslr = True,
                                   list_unused_feature = [feature_removed] if feature_removed is not None else None,
                                  )
                
                dict_setting_results[tmp_setting] = tmp_res_df
                
                tail_log = "_log" if is_log else ""
                tail_cross = "_X" if is_cross else ""
                tail_horizon = "% sD" % horizon_forecast
                tail_method = "LR"
                
                if is_cross:
                    tail_removed_features = "EX" + feature_removed.replace("_", "").lower() if feature_removed is not None else "allX" 
                else:
                    tail_removed_features = "noX" 
                
                #tail_max = "2ndmax"
                tail_max = "1stmax"
                #tail_max = "90qmax"
                
                fname_write = "vimsWW_% s_pred_% s_% s_% s% s% s.csv" % (tail_max, tail_removed_features, tail_method, tail_horizon, tail_cross, tail_log, )
                tmp_res_df.to_csv(os.path.join("res", "vimsWW_pred_truehorizon", fname_write), index = False)



setting = (True, False, True, 1, 'LinearRegression')
removed feature = None
Subseqs:
	 0 2018-11-01 2019-01-22 84
	 1 2019-02-02 2019-03-24 52
	 2 2019-03-27 2019-06-24 91
	 3 2019-06-27 2019-07-25 30
	 4 2019-08-02 2019-11-16 108
	 5 2019-11-20 2020-02-12 86
	 6 2020-02-15 2020-03-27 43
	 7 2020-03-31 2020-07-16 109
	 8 2020-09-04 2020-09-25 23
	 9 2021-02-09 2021-03-07 28
	 10 2021-04-01 2021-06-28 90
	 11 2021-07-01 2021-07-21 22
	 12 2021-07-29 2021-08-31 35
	 13 2021-09-03 2021-10-27 56
	 14 2021-10-30 2022-01-01 65
	 15 2022-01-15 2022-03-26 72
	 16 2022-03-29 2022-07-23 118
	 17 2022-07-28 2022-12-09 136

seq 1 finished 0:01:35.373192
seq 2 finished 0:05:42.160481
seq 3 finished 0:07:04.443502
seq 4 finished 0:14:24.376781
seq 5 finished 0:22:00.699770
seq 6 finished 0:25:50.066923
seq 7 finished 0:38:09.578464
seq 8 finished 0:40:15.202003
seq 9 finished 0:43:03.376741
seq 10 finished 0:55:12.293862
seq 11 finished 0:57:32.454958
seq 12 finished 1:02:13.823229
seq 13 finished 

  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + np.log(list_cur_list_trans_mat[index_X][tt][ii, jj]) + \
  tmp_log_prob.append(logsumexp([np.log(list_cur_list_trans_mat[index_X][tt][ii, j]) + list_cur_mat_log_b[index_X][tt + 1, j] + list_cur_log_backward_prob[index_X][tt + 1][j] for j in range(K)]))
  list_tmp_array3d_log_epsilon[index_X][tt, ii, jj] = list_cur_log_forward_prob[index_X][tt, ii] + n

seq 2 finished 0:05:22.299526
seq 3 finished 0:06:37.338005
seq 4 finished 0:13:28.887337
seq 5 finished 0:20:20.115780
seq 6 finished 0:23:54.950781
seq 7 finished 0:34:51.581781
seq 8 finished 0:36:41.581779
seq 9 finished 0:39:16.560429
seq 10 finished 0:49:52.692118
seq 11 finished 0:51:58.601634
seq 12 finished 0:55:57.747011
seq 13 finished 1:02:53.822838
seq 14 finished 1:11:26.750836
seq 15 finished 1:21:28.572836
seq 16 finished 1:40:19.667130
seq 17 finished 2:05:03.831714


## End of Notebook