In [192]:
import numpy as np
import torch.nn as nn
import torch
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from src.helpers.functions import get_data, pc_T, predict_pca, estimate_AR_res, generate_data
from src.helpers.functions import select_AR_lag_SIC, winsor, lag_matrix
from src.helpers.refactored import ar_forecast,ar_predict, scale_X, reduce_dimensions, loocv_ts, standardize, forecast, loocv_ts_bayes
from src.helpers.autoencoder import Autoencoder
from src.helpers.lstm_ae import LSTMAutoencoder
from src.helpers.forecast import Forecast

%reload_ext autoreload
%autoreload 2

In [242]:
def out_sample(X, y, dim_method="ae", scale_method="distance_correlation", h=1, hyper_params_grid=None, forecast_method="ols",forecast_params=None, update_period=60, target="missing", in_sample_periods=None):
    """Run an expanding-window out-of-sample forecasting exercise.

    For each forecast origin the predictors observed so far are standardized,
    multiplied by winsorized absolute scaling coefficients, reduced in
    dimension and passed to the forecasting model. An AR(p) forecast with an
    SIC-selected lag order is produced alongside as the benchmark.

    Parameters
    ----------
    X : array indexed as ``X[time, variable]``
        Predictor panel; only rows ``[:M + n]`` are used at origin ``n``.
    y : array indexed as ``y[time]``
        Target series, aligned with the rows of ``X``.
    dim_method : str
        Dimension-reduction method. ``"ae"`` and ``"lstm"`` train a model once
        at the first origin; ``"ae"`` is tuned with ``loocv_ts_bayes``, every
        other method with ``loocv_ts``.
    scale_method : str
        Forwarded to ``scale_X`` and to the tuning helpers.
    h : int
        Forecast horizon in periods.
    hyper_params_grid : dict or None
        Search grid/space for the dimension-reduction hyperparameters.
    forecast_method : str
        Forwarded to ``Forecast``.
    forecast_params : dict or None
        Grid for ``Forecast.cross_validate``; falsy values skip the
        cross-validation of the forecasting model entirely.
    update_period : int
        Re-tune every ``update_period`` origins; a falsy value (0/None)
        disables periodic re-tuning.
    target : str
        Name of the target, only used to build the result label.
    in_sample_periods : int or None
        Number of initial observations reserved for estimation. ``None``
        keeps the original window of ``(2000 - 1959) * 12`` observations.

    Returns
    -------
    dict
        ``forecast_spca``, ``forecast_ar``, ``actual_y``, ``error_spca`` and
        ``error_ar`` (each of shape ``(N - h, 1)``), the scalar ``R2_spca``
        (out-of-sample R squared against the AR benchmark, as a fraction) and
        ``label`` (a string identifying the configuration).

    Raises
    ------
    ValueError
        If the sample leaves no out-of-sample forecasts for the horizon.
    """
    T = y.shape[0]

    # In sample periods (default: the original hard-coded estimation window)
    M = (2000 - 1959) * 12 if in_sample_periods is None else int(in_sample_periods)
    N = T - M  # Out of sample periods
    n_forecasts = N - h

    if n_forecasts <= 0:
        raise ValueError(
            f"No out-of-sample forecasts possible: T={T}, in-sample periods={M}, h={h}"
        )

    forecast_spca = np.zeros((n_forecasts, 1))  # Forecasts of the scaled dimension-reduction model
    forecast_ar = np.zeros((n_forecasts, 1))  # Forecasts of the AR benchmark
    actual_y = np.zeros((n_forecasts, 1))  # Realized values of y

    # Initialize the models
    fc = Forecast(method=forecast_method, hyper_params=forecast_params, h=h)
    ae = None

    p_max = 3  # Max number of lags for AR(p) model

    # Report progress roughly every 20 percent; never let the step be 0,
    # which would raise ZeroDivisionError for very short evaluation samples.
    progress_step = max(N // 5, 1)

    # Loop over all out of sample periods
    for n in range(n_forecasts):
        if n % progress_step == 0:
            print(f"Out of sample period {n} out of {N} periods")

        # Periodic re-tuning happens on multiples of update_period, never at n == 0
        # (the first origin always tunes) and never when update_period is falsy.
        is_update_step = bool(update_period) and n > 0 and n % update_period == 0

        # Use all available data up to the forecast origin
        X_t = X[:(M + n), :]
        y_t = y[:M + n]

        # The last observation used is y[M + n - 1], so the h-step-ahead target is:
        actual_y[n] = y[M + n + h - 1]

        # Standardize the data
        X_t = standardize(X_t)

        # Get number lags
        p_AR_star_n = select_AR_lag_SIC(y_t, h, p_max=p_max)

        # Compute the forecast of the AR model
        forecast_ar[n] = ar_predict(y_t, p_AR_star_n, h)

        #### STEP 1: Scaling factors ####

        # Compute the betas for scaling the variables
        beta = scale_X(X_t, y_t, h, method=scale_method, p_AR_star_n=p_AR_star_n)

        # Winsorizing the betas
        beta_win = winsor(np.abs(beta), p=(0, 90))

        # Scale the factors by the winsorized betas
        scaleX_t = X_t * beta_win

        #### Intermezzo: Find the optimal number of factors (or other hyperparameters) ####

        if n == 0:
            if dim_method != "ae":
                hyper_params = loocv_ts(X=X_t, y=y_t, h=h, p_AR_star_n=p_AR_star_n, method=dim_method,
                                        forecast_method=forecast_method, scale_method=scale_method, grid=hyper_params_grid)
            else:
                # Use bayesian optimization to find the optimal hyperparameters
                hyper_params = loocv_ts_bayes(X=X_t, y=y_t, h=h, p_AR_star_n=p_AR_star_n, method=dim_method,
                                               scale_method=scale_method, space=hyper_params_grid, trials=100)
            print("-----------------------------------------------------------------")
            print("Initial hyperparameter optimization done")

            if dim_method == "ae":
                # Initialize the autoencoder
                ae = Autoencoder(input_dim=X.shape[1], activation=nn.SiLU, hyper_params=hyper_params)

                # Train the autoencoder on the in sample data
                ae.train_model(scaleX_t, lr=hyper_params.get("lr", 0.001), num_epochs=hyper_params.get("epochs", 300))
            elif dim_method == "lstm":
                # Initialize the autoencoder
                ae = LSTMAutoencoder(input_dim=X.shape[1], hyper_params=hyper_params)

                # Train the autoencoder on the in sample data
                ae.train_model(scaleX_t, lr=hyper_params.get("lr", 0.001), num_epochs=hyper_params.get("epochs", 300))

                print("Autoencoder training done")

        ### Updating hyperparameters during forecasting ###
        # The autoencoder hyperparameters are deliberately not re-tuned.
        if is_update_step and dim_method != "ae":
            # Same arguments as the initial tuning call: the unscaled X_t plus the
            # forecast method. NOTE(review): this assumes loocv_ts applies
            # `scale_method` itself, as the initial call implies - confirm.
            hyper_params = loocv_ts(X=X_t, y=y_t, h=h, p_AR_star_n=p_AR_star_n, method=dim_method,
                                    forecast_method=forecast_method, scale_method=scale_method, grid=hyper_params_grid)

        #### STEP 2: Dimension Reduction ####

        # Compute the reduced dimensionality representation of the factors
        x_spc = reduce_dimensions(X=scaleX_t, hyper_params=hyper_params, method=dim_method, dim_red_model=ae)

        #### STEP 3: Forecasting ####

        if p_AR_star_n > 0:
            # Add lags of y to x
            x_spc = lag_matrix(x_spc, y_t, p_AR_star_n)

            # Drop the leading observations of y_t to align it with the lagged regressors.
            # NOTE(review): this drops p - 1 observations; verify against lag_matrix.
            y_t = y_t[(p_AR_star_n-1):]

        # Cross validate the forecasting model at the first origin and on update steps
        if forecast_params and (n == 0 or is_update_step):
            fc.cross_validate(x_spc, y_t, hyper_params=forecast_params)

        # Compute the forecast of the scaled dimension-reduction model
        forecast_spca[n] = fc.predict(x_spc, y_t)

    # Compute the forecast errors
    error_spca = actual_y - forecast_spca
    error_ar = actual_y - forecast_ar

    # Compute the R squared out of sample against the AR model
    SSE_spca = np.sum(error_spca**2)
    SSE_ar = np.sum(error_ar**2)

    print("MSE_spca: ", round(SSE_spca/len(error_ar), 10), "MSE_ar: ", round(SSE_ar/len(error_ar), 10))

    R2_spca = (1 - SSE_spca / SSE_ar)

    print("R2_spca: ", round(R2_spca * 100, 4))

    # Label identifying this configuration (e.g. for file names when the caller
    # persists the results); the autoencoder is tagged as Bayesian-optimized.
    dim_label = "ae_bayes" if dim_method == "ae" else dim_method
    label = f"{target}_{dim_label}_{scale_method}_{forecast_method}_h{h}"

    # Return the results instead of discarding them, so callers can analyse or save them.
    return {
        "forecast_spca": forecast_spca,
        "forecast_ar": forecast_ar,
        "actual_y": actual_y,
        "error_spca": error_spca,
        "error_ar": error_ar,
        "R2_spca": R2_spca,
        "label": label,
    }

In [139]:
# Load the data bundle once; it is indexed by name below ('data' here, the
# forecasting targets in later cells).
variables = get_data()

# Predictor panel as a plain array
X = variables["data"].values

In [243]:
# Set seed of numpy and torch
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

# Dimension Reduction Hyperparameters
ae_params = {"hidden_dim": np.arange(1, 11, 2),
                "layer_dims": [[], [16], [32], [32, 32], [64, 32]],
                "epochs": [200, 300],
                "update_epochs": [10],
                'batch_size': [32, 64, 100],
                'lr': [0.001, 0.01],
                'update_lr': [0.001, 0.01],
}

rbf_params = {"gamma": 10**np.arange(-6,-1.5,.5),
            "n_components": np.arange(1, 11, 1),
}

sigmoid_params = {
    "gamma": 10**np.arange(-6,-1.5,.5),
    "n_components": np.arange(1, 11, 1)
}

pca_params = {"nfac": np.arange(1, 11, 1)}

# Regression hyperparameters
# TODO: fill in the definitive hyperparameter grids
#       (original note read "ENTER EXTERTATE HYPERPARAMETERS")
krr_params = {
              "kernel": ["rbf"],
              "gamma": [0.3, 0.5, 0.7],
              "alpha": [0.3, 0.5, 0.7],
}

rf_params_grid = {
    "n_estimators": [100, 200],
    "max_depth": [5],
    "max_features": [1/3],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 3],
}

targets = ['inflation', 'unemployment', 'ip_growth']
scale_methods = ['distance_correlation', 'regression']
# Full menu is ['ae', 'rbf', 'sigmoid', 'pca']; this run is restricted to PCA.
dim_methods = ['pca']
forecast_methods = ['rf']

parameters = {'ae': ae_params,
              'pca': pca_params,
              'rbf': rbf_params,
              'sigmoid': sigmoid_params,}

# Cross-validation grid per forecasting method. Methods without an entry
# (e.g. 'ols') get None, which makes out_sample skip the cross-validation.
# NOTE(review): 'krr' is assumed to be the Forecast method name that
# krr_params belongs to - confirm against src.helpers.forecast.
forecast_param_grids = {'rf': rf_params_grid,
                        'krr': krr_params,}

horizons = [1]

# Re-tuning interval passed to out_sample. The captured output reports 228
# out-of-sample periods, so 1000 means the periodic re-tuning never triggers.
UPDATE_PERIOD = 1000

# Outcome of every configuration, keyed by
# (target, horizon, dim_method, scale_method, forecast_method).
results = {}

# Run the forecasting exercise
for target in targets:
    print("Target: ", target)
    y = variables[target].values
    for h in horizons:
        print("Horizon: ", h)
        for dim_method in dim_methods:
            print("Dimension Reduction: ", dim_method)
            for scale_method in scale_methods:
                print("Scaling Method: ", scale_method)
                for forecast_method in forecast_methods:
                    print("Forecasting Method: ", forecast_method)

                    # `result` keeps holding the most recent run, as before
                    result = out_sample(
                        X = X,
                        y = y,
                        scale_method=scale_method,
                        dim_method=dim_method,
                        forecast_method=forecast_method,
                        hyper_params_grid=parameters[dim_method],
                        h=h,
                        forecast_params=forecast_param_grids.get(forecast_method),
                        target=target,
                        update_period=UPDATE_PERIOD)

                    # Keep every run instead of only the last one
                    results[(target, h, dim_method, scale_method, forecast_method)] = result

Target:  inflation
Horizon:  1
Dimension Reduction:  pca
Scaling Method:  distance_correlation
Forecasting Method:  rf
Out of sample period 0 out of 228 periods
Number of model configurations:  10


In [205]:
# Reproducibility: fix the numpy and torch random seeds
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

# ---- Search spaces for the dimension-reduction step ----
ae_params = dict(
    hidden_dim=np.arange(1, 11, 2),
    layer_dims=[[], [16], [32], [32, 32], [64, 32]],
    epochs=[200, 300],
    update_epochs=[10],
    batch_size=[64, 100],
    lr=[0.001, 0.01],
    update_lr=[0.001, 0.01],
)

rbf_params = dict(
    gamma=10 ** np.arange(-6, -1.5, .5),
    n_components=np.arange(1, 11, 1),
)

sigmoid_params = dict(
    gamma=10 ** np.arange(-6, -1.5, .5),
    n_components=np.arange(1, 11, 1),
)

pca_params = dict(nfac=np.arange(1, 11, 1))

# ---- Search grids for the forecasting models ----
# TODO: the grids below are placeholders that still need to be finalised
krr_params = dict(
    kernel=["rbf"],
    gamma=[0.3, 0.5, 0.7],
    alpha=[0.3, 0.5, 0.7],
)

rf_params_grid = dict(
    n_estimators=[100, 200],
    max_depth=[5, None],
    max_features=["sqrt", "log2"],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 3],
)

# ---- Experiment design ----
targets = ['inflation', 'unemployment', 'ip_growth']
scale_methods = ['distance_correlation', 'regression']
dim_methods = ['ae', 'rbf', 'sigmoid', 'pca']
forecast_methods = ['ols']

# Dimension-reduction method -> its search space
parameters = dict(
    ae=ae_params,
    pca=pca_params,
    rbf=rbf_params,
    sigmoid=sigmoid_params,
)

horizons = [12]

# ---- Run every (target, horizon, reduction, scaling, forecaster) combination ----
for target in targets:
    print(f"Target:  {target}")
    y = variables[target].values

    for h in horizons:
        print(f"Horizon:  {h}")

        for dim_method in dim_methods:
            print(f"Dimension Reduction:  {dim_method}")

            for scale_method in scale_methods:
                print(f"Scaling Method:  {scale_method}")

                for forecast_method in forecast_methods:
                    print(f"Forecasting Method:  {forecast_method}")

                    # No grid is supplied for the forecasting model in this cell
                    result = out_sample(
                        X=X,
                        y=y,
                        scale_method=scale_method,
                        dim_method=dim_method,
                        forecast_method=forecast_method,
                        hyper_params_grid=parameters[dim_method],
                        h=h,
                        forecast_params=None,
                        target=target,
                        update_period=1000,
                    )

Target:  inflation
Horizon:  12
Dimension Reduction:  ae
Scaling Method:  distance_correlation
Forecasting Method:  ols
Out of sample period 0 out of 228 periods
100%|██████████| 100/100 [31:53<00:00, 19.13s/trial, best loss: 2.350099327267766e-06]
{'batch_size': 0, 'dropout': 0.2738930912272174, 'epochs': 0, 'gauss_noise': 0.7360571083061713, 'hidden_dim': 8, 'layer_dims': 2, 'lr': 0, 'update_epochs': 0, 'update_lr': 1}
Best model configuration:  {'batch_size': 32, 'dropout': 0.2738930912272174, 'epochs': 200, 'gauss_noise': 0.7360571083061713, 'hidden_dim': 9, 'layer_dims': [32], 'lr': 0.001, 'update_epochs': 10, 'update_lr': 0.01}
-----------------------------------------------------------------
Initial hyperparameter optimization done
Out of sample period 45 out of 228 periods
Out of sample period 90 out of 228 periods
Out of sample period 135 out of 228 periods
Out of sample period 180 out of 228 periods
MSE_spca:  1.14749e-05 MSE_ar:  1.11619e-05
R2_spca:  -2.8
Scaling Method:  r

In [236]:
# Narrowed autoencoder search space (defined here, not used by the call below)
ae_params = dict(
    hidden_dim=list(np.arange(1, 11, 2)),
    layer_dims=[[32, 32], [64, 32]],
    lr=[0.01],
    epochs=[200],
    update_epochs=[1],
    update_lr=[0.001],
    batch_size=[32, 64, 100],
)

# Single run: unemployment, 12 periods ahead, RBF reduction with regression scaling.
# `rbf_params` is the grid defined in an earlier cell.
y = variables['unemployment'].values
out_sample(
    X=X, y=y,
    scale_method="regression", dim_method="rbf", forecast_method="ols",
    hyper_params_grid=rbf_params,
    h=12,
    forecast_params=None,
    target="unemployment",
    update_period=1000,
);

Out of sample period 0 out of 228 periods
Number of model configurations:  90
Best model configuration:  {'gamma': 0.01, 'n_components': 2}
-----------------------------------------------------------------
Initial hyperparameter optimization done
Out of sample period 45 out of 228 periods
Out of sample period 90 out of 228 periods
Out of sample period 135 out of 228 periods
Out of sample period 180 out of 228 periods
MSE_spca:  0.0258633009 MSE_ar:  0.0259924274
R2_spca:  0.4968


In [169]:
# Smaller search spaces for quick experiments
pca_params = dict(nfac=np.arange(1, 6, 1))
y = variables['inflation'].values

rbf_params = dict(
    gamma=10 ** np.arange(-6, -1.5, .5),
    n_components=np.arange(1, 7, 1),
)

sigmoid_params = dict(
    gamma=10 ** np.arange(-6, -1.5, .5),
    n_components=np.arange(1, 7, 1),
)

# One fixed autoencoder configuration (every entry has a single candidate)
ae_params = dict(
    hidden_dim=[6],
    layer_dims=[[]],
    dropout=[0],
    gauss_noise=[0.1],
    lr=[0.01],
    epochs=[200],
    update_epochs=[10],
    update_lr=[0.001],
)

# Single run: inflation, 6 periods ahead, autoencoder reduction with regression scaling
out_sample(
    X=X, y=y,
    scale_method="regression", dim_method="ae", forecast_method="ols",
    hyper_params_grid=ae_params,
    h=6,
    forecast_params=None,
    target="",
    update_period=600,
);

Out of sample period 0 out of 228 periods
AR forecast: [0.00353078]
Number of model configurations:  1
Best model configuration:  {'hidden_dim': 6, 'layer_dims': [], 'dropout': 0, 'gauss_noise': 0.1, 'lr': 0.01, 'epochs': 200, 'update_epochs': 10, 'update_lr': 0.001}
-----------------------------------------------------------------
Initial hyperparameter optimization done
AR forecast: [0.00383448]
AR forecast: [0.00353257]
AR forecast: [0.0033777]
AR forecast: [0.00347287]
AR forecast: [0.00376862]
AR forecast: [0.00352282]
AR forecast: [0.00317078]
AR forecast: [0.00330988]
AR forecast: [0.00365063]
AR forecast: [0.00306812]
AR forecast: [0.00325079]
AR forecast: [0.00323928]
AR forecast: [0.00342931]
AR forecast: [0.00342526]
AR forecast: [0.00351904]
AR forecast: [0.00366195]
AR forecast: [0.00337421]
AR forecast: [0.00332015]
AR forecast: [0.00345953]
AR forecast: [0.00350451]
AR forecast: [0.00340676]
AR forecast: [0.00345078]
AR forecast: [0.00339994]
AR forecast: [0.00339596]
AR

In [None]:
# Summarise the mean squared forecast errors stored in `result`.
#
# `result` is whatever the last out_sample(...) call returned. It may not be a
# dict, and it may lack some of the entries below (out_sample computes no
# plain-PCA errors), so each entry is looked up defensively instead of
# failing on the first missing key.
def _mean_squared(errors):
    """Return the mean of the squared entries of ``errors``, or None if ``errors`` is None."""
    if errors is None:
        return None
    return np.mean(np.asarray(errors) ** 2)


available_errors = result if isinstance(result, dict) else {}

errors_ar = available_errors.get('error_ar')
errors_pca = available_errors.get('error_pca')
errors_spca = available_errors.get('error_spca')

missing_errors = [key for key in ('error_ar', 'error_pca', 'error_spca') if key not in available_errors]
if missing_errors:
    print("Forecast errors not available in `result`: ", missing_errors)

# Fixed-point formatting for any arrays printed from here on
np.set_printoptions(formatter={'float_kind':'{:f}'.format})
mse_ar = _mean_squared(errors_ar)
mse_pca = _mean_squared(errors_pca)
mse_spca = _mean_squared(errors_spca)

print("MSE AR: ", mse_ar)
print("MSE PCA: ", mse_pca)
print("MSE SPCA: ", mse_spca)