In [1]:
import numpy as np
import torch.nn as nn
import torch
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from src.helpers.functions import get_data, pc_T, predict_pca, estimate_AR_res, generate_data
from src.helpers.functions import select_AR_lag_SIC, winsor, lag_matrix
from src.helpers.refactored import ar_forecast, scale_X, reduce_dimensions, loocv_ts, standardize, forecast
from src.helpers.autoencoder import Autoencoder
from src.helpers.lstm_ae import LSTMAutoencoder
from src.helpers.forecast import Forecast

In [2]:
def out_sample(X, y, dim_method="ae", scale_method="distance_correlation", h=1, hyper_params_grid=None, forecast_method="ols",forecast_params=None):
    """Run an expanding-window out-of-sample forecast and compare it to an AR benchmark.

    For every out-of-sample period the function standardizes all predictor
    rows seen so far, scales each column by a winsorized absolute scaling
    coefficient, reduces the scaled predictors to a low-dimensional
    representation, optionally appends lags of ``y`` and produces one forecast.
    An AR(p) forecast with SIC-selected lag order is computed on the same
    window as the benchmark.

    Parameters
    ----------
    X : array-like, indexed as ``X[:t, :]``
        Predictor matrix with one row per period.
    y : array-like, indexed as ``y[:t]``
        Target series; ``y.shape[0]`` sets the total number of periods.
    dim_method : str
        Dimension-reduction method forwarded to ``loocv_ts`` and
        ``reduce_dimensions``. ``"ae"`` and ``"lstm"`` additionally train an
        autoencoder once, on the first window.
    scale_method : str
        Method forwarded to ``scale_X`` and ``loocv_ts``.
    h : int
        Horizon argument forwarded to the lag-selection, AR, scaling, tuning
        and forecasting helpers.
    hyper_params_grid : dict or None
        Grid forwarded to ``loocv_ts`` for the dimension-reduction step.
    forecast_method : str
        Method forwarded to ``Forecast``.
    forecast_params : dict or None
        If truthy, passed to ``Forecast.cross_validate`` in the first period
        and every 60 periods afterwards.

    Returns
    -------
    dict
        ``"error_spca"`` and ``"error_ar"`` are ``(N, 1)`` arrays of forecast
        errors (actual minus forecast); ``"R2_spca"`` is
        ``1 - SSE_spca / SSE_ar``.
    """
    T = y.shape[0]

    # presumably monthly data starting in 1959 with the first forecast in 1984 — TODO confirm
    M = (1984-1959)*12  # In sample periods
    N = T - M  # Out of sample periods

    forecast_spca = np.zeros((N, 1))  # Forecasts of the scaled dimension-reduction model
    forecast_ar = np.zeros((N, 1))  # Forecasts of the AR benchmark
    actual_y = np.zeros((N, 1))  # Actual values of y

    # Initialize the models
    fc = Forecast(method=forecast_method, hyper_params=forecast_params, h=h)
    ae = None

    p_max = 3  # Max number of lags for AR(p) model
    retune_every = 60  # Periods between hyperparameter re-tuning

    # Report progress roughly every 20 percent. Clamp the step to 1 so that a
    # run with fewer than five out-of-sample periods does not take a modulo
    # by zero.
    progress_step = max(N // 5, 1)

    # Loop over all out of sample periods
    for n in range(N):
        if n % progress_step == 0:
            print(f"Out of sample period {n} out of {N} periods")

        # Use all available data up to time t
        X_t = X[:(M + n), :]
        y_t = y[:M + n]
        # NOTE(review): assumes y is already aligned so that y[M + n] is the
        # value being forecast from the first M + n rows — confirm for h > 1.
        actual_y[n] = y[M + n]

        # Standardize the data
        X_t = standardize(X_t)

        # Get number lags
        p_AR_star_n = select_AR_lag_SIC(y_t, h, p_max=p_max)

        # Compute the forecast of the AR model
        forecast_ar[n] = ar_forecast(p_AR_star_n, y_t, h)

        #### STEP 1: Scaling factors ####

        # Compute the betas for scaling the variables
        beta = scale_X(X_t, y_t, h, method=scale_method, p_AR_star_n=p_AR_star_n)

        # Winsorizing the betas
        beta_win = winsor(np.abs(beta), p=(0, 90))

        # Scale the factors by the winsorized betas
        scaleX_t = X_t * beta_win

        #### Intermezzo: Find the optimal number of factors (or other hyperparameters) ####

        if n == 0:
            hyper_params = loocv_ts(X=X_t, y=y_t, h=h, p_AR_star_n=p_AR_star_n, method=dim_method, scale_method=scale_method, grid=hyper_params_grid)

            if dim_method == "ae":
                # Initialize the autoencoder
                ae = Autoencoder(input_dim=X.shape[1], activation=nn.SiLU, hyper_params=hyper_params)

                # Train the autoencoder on the in sample data
                ae.train_model(scaleX_t, lr=hyper_params.get("lr", 0.001), num_epochs=hyper_params.get("epochs", 300))
            elif dim_method == "lstm":
                # Initialize the autoencoder
                ae = LSTMAutoencoder(input_dim=X.shape[1], hyper_params=hyper_params)

                # Train the autoencoder on the in sample data
                ae.train_model(scaleX_t, lr=hyper_params.get("lr", 0.001), num_epochs=hyper_params.get("epochs", 300))

                print("Autoencoder training done")

        ### Updating hyperparameters during forecasting ###
        # The "ae" method keeps the hyperparameters found in the first period.
        if n > 0 and n % retune_every == 0 and dim_method != "ae":
            hyper_params = loocv_ts(X=X_t, y=y_t, h=h, p_AR_star_n=p_AR_star_n, method=dim_method, scale_method=scale_method, grid=hyper_params_grid)

        #### STEP 2: Dimension Reduction ####

        # Compute the reduced dimensionality representation of the factors
        x_spc = reduce_dimensions(X=scaleX_t, hyper_params=hyper_params, method=dim_method, dim_red_model=ae)

        #### STEP 3: Forecasting ####

        # Add lag of y_t to the factors
        if p_AR_star_n > 0:
            x_spc = lag_matrix(x_spc, y_t, p_AR_star_n)
            y_t = y_t[p_AR_star_n-1:]

        # Cross validate the forecast hyperparameters in the first period and
        # then on the same schedule as the dimension-reduction re-tuning
        if forecast_params and n % retune_every == 0:
            fc.cross_validate(x_spc, y_t, hyper_params=forecast_params)

        # Compute the forecast of the scaled dimension-reduction model
        forecast_spca[n] = fc.predict(x_spc, y_t)

    # Compute the forecast errors
    error_spca = actual_y - forecast_spca
    error_ar = actual_y - forecast_ar

    # Compute the R squared out of sample against the AR model
    SSE_spca = np.sum(error_spca**2)
    SSE_ar = np.sum(error_ar**2)

    R2_spca = (1 - SSE_spca / SSE_ar)

    print("R2_spca: ", round(R2_spca * 100, 2))

    return {"error_spca": error_spca,
            "error_ar": error_ar,
            "R2_spca": R2_spca,
            }

In [3]:
# Load every series once; the "data" entry is used as the predictor matrix,
# the remaining entries are iterated over as forecast targets further down.
variables = get_data()
X = variables["data"].values

print("Shape of X: ", X.shape)

Shape of X:  (720, 123)


In [4]:
# Seed NumPy and PyTorch so that stochastic steps are repeatable
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

# Dimension Reduction Hyperparameters
# Only `pca_params` is passed to the run below; the other grids are not
# referenced in this cell (presumably kept for alternative configurations).
ae_params = {"hidden_dim": [10],
             "layer_dims": [[32, 16]],
             "dropout": [0.1],
             "lr": [0.001],
             "epochs": [300],
}

kpca_params = {"n_components": list(np.arange(1, 5, 1)),
               "kernel": ["poly"],
               "degree": [2, 4],
}

pca_params = {"nfac": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 40, 50]}

# Regression hyperparameters
kkr_params = {"alpha": [0.3, 0.5, 0.7],
              "kernel": ["rbf"],
              "gamma": [0.3, 0.5, 0.7],
}

rf_params = {
    "n_estimators": [200],
    "max_depth": [5],
    "max_features": ["sqrt"],
    "min_samples_split": [2],
    "min_samples_leaf": [1],
}

# Run the forecasting exercise for every target series.
# Each target's output is kept in `results`; previously only the last
# target's dictionary survived the loop because `result` was overwritten.
results = {}
for target in variables.keys():
    if target == 'data':
        # 'data' holds the predictors, not a forecast target
        continue
    print("Target: ", target)
    y = variables[target].values
    result = out_sample(
        X = X,
        y = y,
        scale_method="regression",
        dim_method="pca",
        forecast_method="ols",
        hyper_params_grid=pca_params,
        h=12,
        forecast_params=None,)
    # `result` stays bound to the most recent target for cells that read it
    results[target] = result

Target:  inflation
Out of sample period 0 out of 420 periods
Number of model configurations:  16
Best model configuration:  {'nfac': 2}
Number of model configurations:  16
Best model configuration:  {'nfac': 2}
Out of sample period 84 out of 420 periods
Number of model configurations:  16
Best model configuration:  {'nfac': 2}
Out of sample period 168 out of 420 periods
Number of model configurations:  16
Best model configuration:  {'nfac': 2}
Number of model configurations:  16
Best model configuration:  {'nfac': 2}
Out of sample period 252 out of 420 periods
Number of model configurations:  16
Best model configuration:  {'nfac': 20}
Out of sample period 336 out of 420 periods
Number of model configurations:  16
Best model configuration:  {'nfac': 20}
R2_spca:  -26.39
Target:  unemployment
Out of sample period 0 out of 420 periods
Number of model configurations:  16


KeyboardInterrupt: 

In [None]:
# Summarize the mean squared forecast errors of the most recent `out_sample` run.
# `out_sample` returns "error_ar" and "error_spca" but no "error_pca" entry,
# so the PCA errors are looked up optionally instead of raising a KeyError.
errors_ar = result['error_ar']
errors_pca = result.get('error_pca')
errors_spca = result['error_spca']

# Print floats in fixed-point notation
np.set_printoptions(formatter={'float_kind':'{:f}'.format})
mse_ar = np.mean(errors_ar**2)
mse_spca = np.mean(errors_spca**2)

print("MSE AR: ", mse_ar)
if errors_pca is not None:
    mse_pca = np.mean(errors_pca**2)
    print("MSE PCA: ", mse_pca)
print("MSE SPCA: ", mse_spca)