In [1]:
# Imports
import os

import numpy as np
import torch
import torch.nn as nn
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

from src.helpers.autoencoder import Autoencoder
from src.helpers.functions import lag_matrix, select_AR_lag_SIC
from src.helpers.refactored import reduce_dimensions, loocv_ts, loocv_ts_bayes, winsor, standardize, ar_predict, scale_X

%reload_ext autoreload
%autoreload 2

In [62]:
def gen_data(T, N, h_steps=1, linear=True, n_fac = 3):
    """
    Simulate one sample (predictors, target, true factors) for the Monte Carlo study.

    T + h_steps factor columns are drawn. The target keeps the first T
    entries of (column-sum of the factors + standard normal noise), while the
    predictors are built from the last T factor columns. As a result X[t] and
    y[t + h_steps] originate from the same factor draw.

    Parameters
    ----------
    T : int
        Number of time steps returned
    N : int
        Number of variables in X
    h_steps : int
        Number of steps ahead forecasted (offset between X and y)
    linear : bool
        Forwarded to F; selects the linear or the nonlinear mapping to X
    n_fac : int
        Number of latent factors drawn

    Returns
    -------
    X : ndarray
        T x N array of predictors
    y_t : ndarray
        Length-T array holding the target
    true_f : ndarray
        Factor array handed back by F (rows are factors, columns are time)
    """
    n_draws = T + h_steps

    # Latent factors, one column per period (including the h_steps extra ones)
    factors = np.random.normal(size=(n_fac, n_draws))

    # Target before alignment: sum of the factors plus idiosyncratic noise
    target_all = factors.sum(axis=0) + np.random.normal(size=(n_draws,))

    # Keep the first T targets and the last T factor columns; this is what
    # creates the h_steps lead of X over y
    y_t = target_all[:T].copy()
    aligned_factors = factors[:, h_steps:]

    # Map the aligned factors to the observed predictors
    X, true_f = F(aligned_factors, linear=linear, N=N)

    return X, y_t, true_f

def F(f, linear=True, N=500):
    """
    Build the observed panel X from the latent factors f and add noise.

    Parameters
    ----------
    f : ndarray
        n_fac x T array of factors
    linear : bool
        If True, X is a sparse linear combination of the factors. If False,
        the factors are first expanded (degree-2 polynomial features and
        signs) and X is a dense linear combination of the expanded set.
    N : int
        Number of variables (columns) in X

    Returns
    -------
    X : ndarray
        T x N array of data
    f : ndarray
        The factors X was generated from: the input itself when linear,
        otherwise the expanded factor array.

    Notes
    -----
    The order of the random draws (loadings, zeroed rows, noise variances,
    noise) is part of the behaviour under a fixed seed.

    NOTE(review): PolynomialFeatures(2) is used with its defaults, so its
    output also carries a constant column and the degree-1 terms; these are
    stacked next to the original factors, and the sign is then taken over
    the whole stacked array. Confirm this redundancy is intended.
    """
    n_fac, T = f.shape

    if not linear:
        # Degree-2 polynomial expansion, appended below the raw factors
        poly_terms = PolynomialFeatures(2).fit_transform(f.T)
        f = np.concatenate((f, poly_terms.T), axis=0)

        # Signs of every row collected so far, appended as further rows
        f = np.concatenate((f, np.sign(f)), axis=0)

        # Dense loadings on the expanded factor set
        loadings = np.random.normal(size=(N, f.shape[0]))
    else:
        loadings = np.random.normal(size=(N, n_fac))

        # Zero out 90% of the loading rows, so those variables carry noise only
        silent_rows = np.random.choice(N, int(N*0.90), replace=False)
        loadings[silent_rows, :] = 0

    # Common component of X
    signal = np.dot(f.T, loadings.T)

    # Idiosyncratic noise: per-variable variance drawn from U(0, 1),
    # diagonal covariance across variables
    noise_var = np.random.uniform(0, 1, N)
    noise = np.random.multivariate_normal(mean=np.zeros(N), cov=np.diag(noise_var), size=T)

    return signal + noise, f

In [63]:
def simulation_forecasting(X, y, T, scale_method, dim_method,nfac, hyper_params_grid,h=1, n_in_sample=200):
    """
    Expanding-window, h-step-ahead forecasting for one simulated sample.

    For every forecast origin the predictors are standardized, scaled by
    (winsorized, absolute) betas, reduced in dimension, and an OLS regression
    of y on the reduced factors produces the forecast. Hyperparameters are
    selected once, on the first window; for dim_method == "ae" the
    autoencoder is also trained once on that first window and then reused.

    Parameters
    ----------
    X : ndarray
        2-D array of predictors, one row per period
    y : ndarray
        1-D array holding the target; its length defines the sample size
    T : int
        Ignored: the sample size is always taken from y.shape[0]. Kept so
        existing callers keep working.
    scale_method : str
        Forwarded to scale_X and to the hyperparameter search
    dim_method : str
        Dimension-reduction method; "ae" selects the autoencoder path with
        loocv_ts_bayes, anything else uses loocv_ts
    nfac : int
        Number of columns reserved per row in the returned f_hat
    hyper_params_grid : dict
        Search grid (loocv_ts) or search space (loocv_ts_bayes)
    h : int
        Forecast horizon
    n_in_sample : int
        Number of periods in the first estimation window

    Returns
    -------
    forecast_spca : ndarray
        (T - n_in_sample - h) x 1 array of forecasts
    betas : ndarray
        (T - n_in_sample - h) x X.shape[1] array of winsorized scaling betas
    actual_y : ndarray
        (T - n_in_sample - h) x 1 array of realized targets
    f_hat : ndarray
        (T - n_in_sample - h) x nfac array with the last row of the reduced
        factors at each forecast origin. A row stays zero when the reduced
        representation does not have exactly nfac columns.

    Raises
    ------
    ValueError
        If y is shorter than n_in_sample + h.
    """
    # The sample length always comes from the data, not from the T argument
    T = y.shape[0]

    M = n_in_sample  # In sample periods
    N = T - M  # Out of sample periods
    n_forecasts = N - h

    if n_forecasts < 0:
        raise ValueError(
            "y has {} observations but {} in-sample periods plus horizon h={} are required".format(T, M, h)
        )

    forecast_spca = np.zeros((n_forecasts, 1))  # Forecasts of scaled PCA
    actual_y = np.zeros((n_forecasts, 1))  # Actual values of y
    betas = np.zeros((n_forecasts, X.shape[1]))  # Betas for scaling the variables
    f_hat = np.zeros((n_forecasts, nfac))  # Estimated factors

    # Initialize the models
    ae = None
    lr = LinearRegression()

    p_max = 3 # Max number of lags for AR(p) model

    # Loop over all out of sample periods
    for n in range(n_forecasts):
        # Use all available data up to time t
        X_t = X[:(M + n), :]
        y_t = y[:M + n]

        # Get the actual value of y for the forecast period t+h
        actual_y[n] = y[M + n + h - 1]

        # Standardize the data
        X_t = standardize(X_t)

        # Get number lags (only consumed by the hyperparameter search at n == 0)
        p_AR_star_n = select_AR_lag_SIC(y_t, h, p_max=p_max)

        #### STEP 1: Scaling factors ####

        # Compute the betas for scaling the variables
        beta = scale_X(X_t, y_t, h, method=scale_method)

        # Winsorizing the betas
        beta_win = winsor(np.abs(beta), p=(0, 100))

        # Save the betas
        betas[n, :] = beta_win

        # Scale the variables by the winsorized betas
        scaleX_t = X_t * beta_win

        #### Intermezzo: Find the optimal number of factors (or other hyperparameters) ####

        if n == 0:
            if dim_method != "ae":
                hyper_params = loocv_ts(X=X_t, y=y_t, h=h, p_AR_star_n=p_AR_star_n, method=dim_method,
                                         forecast_method="ols", scale_method=scale_method, grid=hyper_params_grid)
            else:
                # Use bayesian optimization to find the optimal hyperparameters
                hyper_params = loocv_ts_bayes(X=X_t, y=y_t, h=h, p_AR_star_n=p_AR_star_n, method=dim_method,
                                               scale_method=scale_method, space=hyper_params_grid, trials=10, forecast_method="ols")
            print("-----------------------------------------------------------------")

            if dim_method == "ae":
                # Initialize the autoencoder
                ae = Autoencoder(input_dim=X.shape[1], activation=nn.SiLU, hyper_params=hyper_params)

                # Train the autoencoder on the in sample data
                ae.train_model(scaleX_t, lr=hyper_params.get("lr", 0.001), num_epochs=hyper_params.get("epochs", 300))

        #### STEP 2: Dimension Reduction ####

        # Compute the reduced dimensionality representation of the scaled data
        x_spc = reduce_dimensions(X=scaleX_t, hyper_params=hyper_params, method=dim_method, dim_red_model=ae)

        # Store the factor estimate at the forecast origin. Previously f_hat was
        # returned without ever being written. The guard keeps the old all-zero
        # row if the reduced representation is not nfac wide.
        latest_factors = np.asarray(x_spc[-1]).ravel()
        if latest_factors.shape[0] == nfac:
            f_hat[n, :] = latest_factors

        #### STEP 3: Forecasting ####

        # Regress y_{t+h} on the reduced factors at t, then forecast from the last row
        lr.fit(x_spc[:-h], y_t[h:])
        forecast_spca[n] = lr.predict(x_spc[-1].reshape(1, -1))

    # Compute the forecast errors
    error_spca = actual_y - forecast_spca

    # Sum of squared forecast errors, reported below as a mean squared error
    SSE_spca = np.sum(error_spca**2)

    print("MSE_spca: ", round(SSE_spca/len(error_spca), 10))

    return forecast_spca, betas, actual_y, f_hat

In [4]:
# Hyperparameter grids, one per dimension-reduction method

# RBF kernel width: 10^-6 ... 10^-3 in half-decade steps
rbf_params = dict(gamma=np.power(10, np.arange(-6, -2.5, .5)))

# Autoencoder architecture and training options
ae_params = dict(
    layer_dims=[[], [16], [32], [32, 32], [64, 32]],
    epochs=[200, 300],
    update_epochs=[10],
    batch_size=[64, 100],
    lr=[0.001, 0.01],
    update_lr=[0.001, 0.01],
)

# Lookup from method name to its grid; the ae/rbf entries are the very same
# dict objects defined above, so in-place additions are visible through both names
parameters = dict(ae=ae_params, rbf=rbf_params, pca={})

# Name of the grid entry that holds the number of factors for each method
nfac_key = dict(ae='hidden_dim', rbf='n_components', pca='nfac')

In [70]:
# Set the parameters
B = 100  # Monte Carlo replications per configuration
T = 250  # Length of each simulated sample
N = 500  # Number of predictors in X
h_steps = 1  # Forecast horizon
n_fac = 2  # Number of latent factors drawn by gen_data
RESULTS_DIR = 'resources/results/monte_carlo_good'

# Seed both RNGs in play: numpy drives the data generation, torch is seeded
# so the autoencoder runs start from a fixed global torch RNG state
np.random.seed(0)
torch.manual_seed(0)

# np.save does not create missing directories; make sure the target exists
# before spending time on the simulation
os.makedirs(RESULTS_DIR, exist_ok=True)

for linear in [True, False]:
    # Number of factors handed to the dimension-reduction step
    if linear:
        factors = n_fac
    else:
        factors = (4*n_fac + n_fac * (n_fac + 1))//2
    for scaling_method in ['none', 'regression', 'distance_correlation']:
        for dim_method in ['pca', 'rbf', 'ae']:
            # Set factors for the dimension method
            parameters[dim_method][nfac_key[dim_method]] = [factors]

            # Per-replication results; stacked after the loop so no array shape
            # (number of forecasts, number of true factors) has to be hard-coded
            forecasts_runs, betas_runs, actual_y_runs, f_hat_runs, f_runs = [], [], [], [], []

            print('Linear: {}, scaling_method: {}, dim_method: {}'.format(linear, scaling_method, dim_method))
            for r in range(B):
                if r % 10 == 0:
                    print('r: {}'.format(r))

                # Generate the data for THIS configuration (previously linear=True and
                # h_steps=1 were hard-coded, so the nonlinear runs used linear data)
                X, y, f = gen_data(T=T, N=N, h_steps=h_steps, linear=linear, n_fac=n_fac)

                # Do the forecasting
                results = simulation_forecasting(X=X, y=y, T=T, h=h_steps, scale_method=scaling_method, dim_method=dim_method,
                                                    hyper_params_grid=parameters[dim_method], nfac=factors)

                # Extract the results
                forecasts, betas, actual_values, f_hat = results

                # Collect the results of this replication
                forecasts_runs.append(forecasts.flatten())
                betas_runs.append(betas)
                actual_y_runs.append(actual_values.flatten())
                f_hat_runs.append(f_hat)
                f_runs.append(f.T)

            # Stack along a leading replication axis
            forecasts_full = np.stack(forecasts_runs)  # Forecasts
            betas_full = np.stack(betas_runs)  # Betas for the scaling
            actual_y_full = np.stack(actual_y_runs)  # Actual y values
            f_hat_full = np.stack(f_hat_runs)  # Estimated factors
            f_full = np.stack(f_runs)  # True factors, as returned by gen_data

            # Save the results to file
            file_tag = '{}_{}_{}_{}.npy'.format(linear, scaling_method, dim_method, h_steps)
            np.save('{}/forecasts_{}'.format(RESULTS_DIR, file_tag), forecasts_full)
            np.save('{}/scales_{}'.format(RESULTS_DIR, file_tag), betas_full)
            np.save('{}/actual_values_{}'.format(RESULTS_DIR, file_tag), actual_y_full)
            np.save('{}/f_hat_{}'.format(RESULTS_DIR, file_tag), f_hat_full)
            # Save every replication's factors (previously only the last draw `f` was written)
            np.save('{}/f_{}'.format(RESULTS_DIR, file_tag), f_full)

Linear: True, scaling_method: none, dim_method: pca
r: 0
-----------------------------------------------------------------
MSE_spca:  0.9884517596
-----------------------------------------------------------------
MSE_spca:  1.2572764551
-----------------------------------------------------------------
MSE_spca:  1.2103775872
-----------------------------------------------------------------
MSE_spca:  1.0023174106
-----------------------------------------------------------------
MSE_spca:  1.0730746244
-----------------------------------------------------------------
MSE_spca:  1.6888173616
-----------------------------------------------------------------
MSE_spca:  1.09783022
-----------------------------------------------------------------
MSE_spca:  1.2759088669
-----------------------------------------------------------------
MSE_spca:  0.9713608336
-----------------------------------------------------------------
MSE_spca:  1.0866149359
r: 10
---------------------------------------