In [517]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from src.helpers.functions import get_data, pc_T, predict_pca, estimate_AR_res, generate_data
from src.helpers.functions import select_AR_lag_SIC, winsor, lag_matrix
from src.helpers.ardl_multi import ARDL_multi
from src.helpers.regression import linear_reg
from src.helpers.refactored import ar_forecast, scale_X, reduce_dims, loocv_ts


%reload_ext autoreload
%autoreload 2

In [536]:
def out_sample(X, y, *, in_sample_size=(1984 - 1959) * 12, nfac=5, p_max=1,
               scale_method="regression"):
    """Compare PCA and scaled-PCA forecasts with an AR benchmark out of sample.

    For each out-of-sample period the models are re-estimated on an expanding
    window holding every observation before that period. Three one-step-ahead
    forecasts are produced: an AR forecast (``ar_forecast``), a linear
    regression on ``reduce_dims(..., method="pca")`` of the standardized
    predictors, and a linear regression on ``reduce_dims(..., method="kpca")``
    of the standardized predictors multiplied by winsorized absolute betas
    from ``scale_X``.

    Parameters
    ----------
    X : 2-D array
        Predictors; rows are sliced as time periods and must line up with
        the entries of ``y``.
    y : array
        Target series, indexed along its first axis.
    in_sample_size : int, keyword-only
        Number of initial observations in the first estimation window.
        The default is ``(1984 - 1959) * 12`` = 300
        (presumably monthly data from 1959 to 1984 -- TODO confirm).
    nfac : int, keyword-only
        Number of factors requested from ``reduce_dims``.
    p_max : int, keyword-only
        Maximum lag handed to ``select_AR_lag_SIC``.
    scale_method : str, keyword-only
        Forwarded to ``scale_X`` as ``method``.

    Returns
    -------
    dict
        ``"error_pca"``, ``"error_spca"``, ``"error_ar"``: ``(N, 1)`` arrays of
        actual minus forecast, with ``N = len(y) - in_sample_size``.
        ``"R2_pca"``, ``"R2_spca"``: ``1 - SSE_model / SSE_ar`` as fractions
        (the printed values are multiplied by 100).

    Raises
    ------
    ValueError
        If ``y`` has no observations beyond ``in_sample_size``.
    """
    # Forecast horizon. Kept fixed: the target lookup y[M + n] below is the
    # observation immediately after the estimation window, i.e. one step ahead.
    h = 1
    T = y.shape[0]

    M = in_sample_size  # In sample periods
    N = T - M  # Out of sample periods
    if N <= 0:
        raise ValueError(
            "y has {} observations but in_sample_size is {}; "
            "no out-of-sample periods remain".format(T, M)
        )

    forecast_pca = np.zeros((N, 1))  # Forecasts of the PCA model
    forecast_spca = np.zeros((N, 1))  # Forecasts of the scaled PCA model
    forecast_ar = np.zeros((N, 1))  # Forecasts of the AR model
    actual_y = np.zeros((N, 1))  # Actual values of y

    # Initialize the regression models (refitted in every iteration)
    reg_pc = LinearRegression()
    reg_spc = LinearRegression()

    # Loop over all out of sample periods
    for n in range(N):
        # Use all available data up to (but excluding) the period being forecast
        X_t = X[:(M + n), :]
        y_t = y[:M + n]
        actual_y[n] = y[M + n]

        # Standardize each column using only the current window's mean and std
        X_t = (X_t - np.mean(X_t, axis=0)) / np.std(X_t, axis=0)

        # Get number of lags
        p_AR_star_n = select_AR_lag_SIC(y_t, h, p_max=p_max)

        # Compute the forecast of the AR model
        forecast_ar[n] = ar_forecast(p_AR_star_n, y_t, h)

        #### STEP 1: Scaling factors ####

        # Compute the betas for scaling the variables
        beta = scale_X(X_t, y_t, h, method=scale_method, p_AR_star_n=p_AR_star_n)

        # Winsorize the absolute betas
        # (p=(0, 90) presumably clips at the 90th percentile -- TODO confirm)
        beta_win = winsor(np.abs(beta), p=(0, 90))

        # Scale the predictors by the winsorized betas
        scaleX_t = X_t * beta_win

        # NOTE: nfac is taken as given; selecting it with loocv_ts on the
        # first window is currently disabled.

        #### STEP 2: Dimension Reduction ####
        # Compute the principal components
        x_pc = reduce_dims(X=X_t, nfac=nfac, method="pca")
        x_spc = reduce_dims(X=scaleX_t, nfac=nfac, method="kpca")

        #### STEP 3: Forecasting ####

        # Add lags of y_t to the factors and trim y_t so the rows stay aligned
        if p_AR_star_n > 0:
            x_pc = lag_matrix(x_pc, y_t, p_AR_star_n)
            x_spc = lag_matrix(x_spc, y_t, p_AR_star_n)
            y_t = y_t[p_AR_star_n-1:]

        # Estimate regression coefficients: regressors at t explain y at t + h
        reg_pc.fit(x_pc[:-h], y_t[h:])
        reg_spc.fit(x_spc[:-h], y_t[h:])

        # Forecast from the most recent row of regressors
        forecast_pca[n] = reg_pc.predict(x_pc[-1].reshape(1, -1))
        forecast_spca[n] = reg_spc.predict(x_spc[-1].reshape(1, -1))

    # Compute the forecast errors
    error_pca = actual_y - forecast_pca
    error_spca = actual_y - forecast_spca
    error_ar = actual_y - forecast_ar

    # Compute the R squared out of sample against the AR model
    SSE_pca = np.sum(error_pca**2)
    SSE_spca = np.sum(error_spca**2)
    SSE_ar = np.sum(error_ar**2)

    R2_spca = (1 - SSE_spca / SSE_ar)
    R2_pca = (1 - SSE_pca / SSE_ar)

    print("R2_spca: ", round(R2_spca * 100, 2))
    print("R2_pca: ", round(R2_pca* 100, 2))

    return {"error_pca": error_pca,
            "error_spca": error_spca,
            "error_ar": error_ar,
            "R2_spca": R2_spca,
            "R2_pca": R2_pca}

In [None]:
# Load the dataset bundle returned by the project helper
variables = get_data()

# Predictor matrix and an initial target series, as raw NumPy arrays
X, y = variables['data'].values, variables['unemployment'].values

# Quick sanity check of the dimensions
print("Shape of X: ", X.shape, "Shape of y: ", y.shape)

In [537]:
# Run the forecasting exercise for every target series.
# Each target's output is stored in `results` (keyed by target name) so that
# no run is lost; `result` still refers to the most recent target's output.
results = {}
for target in variables.keys():
    # 'data' is the entry used as the predictor matrix X, not a target
    if target == 'data':
        continue
    print("Target: ", target)
    y = variables[target].values
    result = out_sample(X = X, y = y)
    results[target] = result

Target:  inflation
R2_spca:  6.34
R2_pca:  -14.11
Target:  unemployment


KeyboardInterrupt: 

#### Regression scaling, kpca RBF
Target:  inflation 
R2_spca:  12.39
R2_pca:  -0.76
Target:  unemployment
R2_spca:  17.22
R2_pca:  10.12
Target:  ip_growth
R2_spca:  13.22
R2_pca:  7.88
Target:  volatility
R2_spca:  5.4
R2_pca:  1.13

#### Distance correlation scaling, kpca RBF
Target:  inflation
R2_spca:  9.03
R2_pca:  -0.76
Target:  unemployment
R2_spca:  17.59
R2_pca:  10.12
Target:  ip_growth
R2_spca:  10.88
R2_pca:  7.88
Target:  volatility
R2_spca:  5.66
R2_pca:  1.13

#### Distance correlation scaling, kpca poly(5)
Target:  inflation
R2_spca:  9.73
R2_pca:  -0.76
Target:  unemployment
R2_spca:  18.86
R2_pca:  10.12
Target:  ip_growth
R2_spca:  12.63
R2_pca:  7.88
Target:  volatility
R2_spca:  4.63
R2_pca:  1.13




In [None]:
# Unpack the forecast errors held in `result`
errors_ar, errors_pca, errors_spca = (
    result[key] for key in ('error_ar', 'error_pca', 'error_spca')
)

# Use fixed-point notation for floats inside printed NumPy arrays
np.set_printoptions(formatter={'float_kind':'{:f}'.format})

# Mean squared forecast error of each model
mse_ar, mse_pca, mse_spca = (
    np.mean(errs**2) for errs in (errors_ar, errors_pca, errors_spca)
)

print("MSE AR: ", mse_ar)
print("MSE PCA: ", mse_pca)
print("MSE SPCA: ", mse_spca)