In [325]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from src.helpers.functions import get_data, pc_T, predict_pca, estimate_AR_res, generate_data
from src.helpers.functions import select_AR_lag_SIC, winsor, lag_matrix
from src.helpers.ardl_multi import ARDL_multi
from src.helpers.regression import linear_reg
from src.helpers.refactored import ar_forecast


%reload_ext autoreload
%autoreload 2

In [349]:
def out_sample(X, y, n_in_sample=(1984 - 1959) * 12, nfac=5, p_max=1, verbose=True):
    """Run an expanding-window, one-step-ahead out-of-sample forecast comparison.

    For every out-of-sample period the predictors observed so far are
    standardized, an AR benchmark forecast is produced, and two factor
    regressions are fitted: one on principal components of the standardized
    predictors (PCA) and one on principal components of the predictors scaled
    by winsorized predictive slopes (scaled PCA). Forecast errors and the
    out-of-sample R^2 of both factor models relative to the AR benchmark are
    returned.

    Parameters
    ----------
    X : np.ndarray
        2-D predictor matrix, one row per period and one column per variable.
    y : np.ndarray
        1-D target series; row ``t`` of ``X`` and ``y[t]`` refer to the same period.
    n_in_sample : int, optional
        Number of initial periods used only for estimation. Defaults to
        ``(1984 - 1959) * 12`` (300 monthly observations).
    nfac : int, optional
        Number of principal components passed to ``pc_T``.
    p_max : int, optional
        Maximum AR lag order passed to ``select_AR_lag_SIC``.
    verbose : bool, optional
        When True (default) print a progress line and the summary statistics.

    Returns
    -------
    dict
        ``error_pca``, ``error_spca``, ``error_ar``: ``(N, 1)`` arrays of
        forecast errors (actual minus forecast), where ``N`` is the number of
        out-of-sample periods. ``R2_spca``, ``R2_pca``: ``1 - SSE_model / SSE_ar``
        as fractions (not percentages).

    Raises
    ------
    ValueError
        If ``X`` has fewer rows than ``y`` or if no out-of-sample period is left.
    """
    # Forecast horizon. Kept fixed: the target indexing ``y[M + n]`` below is
    # only the correct realization for a one-step-ahead forecast.
    h = 1
    T = y.shape[0]

    if X.shape[0] < T:
        raise ValueError(
            "X has {} rows but y has {} observations".format(X.shape[0], T))

    M = n_in_sample  # In sample periods
    N = T - M  # Out of sample periods

    if N <= 0:
        raise ValueError(
            "n_in_sample ({}) leaves no out-of-sample periods for a series of "
            "length {}".format(M, T))

    forecast_pca = np.zeros((N, 1))  # Forecasts of the PCA model
    forecast_spca = np.zeros((N, 1))  # Forecasts of the scaled PCA model
    forecast_ar = np.zeros((N, 1))  # Forecasts of the AR benchmark
    actual_y = np.zeros((N, 1))  # Realized values of y

    # The regression objects are re-fitted in every iteration
    reg_pc = LinearRegression()
    reg_spc = LinearRegression()
    lr = LinearRegression()

    # Loop over all out of sample periods
    for n in range(N):
        if verbose:
            print("Forecast {} out of {}".format(n, N))

        # Expanding window: everything observed before period M + n
        X_t = X[:(M + n), :]
        y_t = y[:M + n]
        actual_y[n] = y[M + n]

        # Standardize each column using moments of the current window only
        X_t = (X_t - np.mean(X_t, axis=0)) / np.std(X_t, axis=0)

        # Lag order of y to append to the factor regressions
        p_AR_star_n = select_AR_lag_SIC(y_t, h, p_max=p_max)

        # AR benchmark forecast.
        # NOTE(review): the first argument is hard-coded to 10 and is not tied
        # to p_max -- presumably the AR lag order; confirm against ar_forecast.
        forecast_ar[n] = ar_forecast(10, y_t, h)

        #### STEP 1: Scaling factors ####

        # Slope of y_{t+h} on each standardized predictor, one at a time
        n_vars = X_t.shape[1]
        beta = np.full(n_vars, np.nan)
        for j in range(n_vars):
            lr.fit(X_t[:-h, j].reshape(-1, 1), y_t[h:])
            beta[j] = lr.coef_[0]

        # Winsorize the absolute slopes
        beta_win = winsor(np.abs(beta), p=(10, 90))

        # Scale every column by its winsorized slope (broadcast over rows)
        scaleX_t = X_t * np.asarray(beta_win, dtype=float).reshape(1, -1)

        #### STEP 2: Dimension Reduction ####

        # Principal components of the plain and of the scaled predictors
        _, x_pc, _, _, _ = pc_T(X_t, nfac)
        _, x_spc, _, _, _ = pc_T(scaleX_t, nfac)

        #### STEP 3: Forecasting ####

        # Add lags of y_t to the factors.
        # NOTE(review): the trimming of y_t assumes lag_matrix drops the first
        # p_AR_star_n - 1 rows -- confirm against lag_matrix.
        if p_AR_star_n > 0:
            x_pc = lag_matrix(x_pc, y_t, p_AR_star_n)
            x_spc = lag_matrix(x_spc, y_t, p_AR_star_n)
            y_t = y_t[p_AR_star_n-1:]

        # Regress y_{t+h} on the regressors dated t
        reg_pc.fit(x_pc[:-h], y_t[h:])
        reg_spc.fit(x_spc[:-h], y_t[h:])

        # Forecast from the most recent row of regressors
        forecast_pca[n] = reg_pc.predict(x_pc[-1].reshape(1, -1))
        forecast_spca[n] = reg_spc.predict(x_spc[-1].reshape(1, -1))

    # Forecast errors (actual minus forecast)
    error_pca = actual_y - forecast_pca
    error_spca = actual_y - forecast_spca
    error_ar = actual_y - forecast_ar

    # Out-of-sample R squared relative to the AR benchmark
    SSE_pca = np.sum(error_pca**2)
    SSE_spca = np.sum(error_spca**2)
    SSE_ar = np.sum(error_ar**2)

    R2_spca = (1 - SSE_spca / SSE_ar)
    R2_pca = (1 - SSE_pca / SSE_ar)

    if verbose:
        # R2 values are printed as percentages; the returned values are fractions
        print("R2_spca: ", round(R2_spca * 100, 2))
        print("R2_pca: ", round(R2_pca* 100, 2))
        print("SSE PCA: ", SSE_pca, "SSE SPCA: ", SSE_spca, "SSE AR: ", SSE_ar)

    return {"error_pca": error_pca,
            "error_spca": error_spca,
            "error_ar": error_ar,
            "R2_spca": R2_spca,
            "R2_pca": R2_pca}

In [347]:
# Load the dataset and pull out the predictor matrix and the target series
variables = get_data()
X, y = variables['data'].values, variables['unemployment'].values

# Sanity check: both arrays should cover the same number of periods
print("Shape of X: ", X.shape, "Shape of y: ", y.shape)

[*********************100%***********************]  1 of 1 completed
Shape of X:  (720, 123) Shape of y:  (720,)


In [350]:
# Execute the out-of-sample forecast comparison on the loaded data
result = out_sample(X=X, y=y)

Forecast 0 out of 420
Forecast 1 out of 420
Forecast 2 out of 420
Forecast 3 out of 420
Forecast 4 out of 420
Forecast 5 out of 420
Forecast 6 out of 420
Forecast 7 out of 420
Forecast 8 out of 420
Forecast 9 out of 420
Forecast 10 out of 420
Forecast 11 out of 420
Forecast 12 out of 420
Forecast 13 out of 420
Forecast 14 out of 420
Forecast 15 out of 420
Forecast 16 out of 420
Forecast 17 out of 420
Forecast 18 out of 420
Forecast 19 out of 420
Forecast 20 out of 420
Forecast 21 out of 420
Forecast 22 out of 420
Forecast 23 out of 420
Forecast 24 out of 420
Forecast 25 out of 420
Forecast 26 out of 420
Forecast 27 out of 420
Forecast 28 out of 420
Forecast 29 out of 420
Forecast 30 out of 420
Forecast 31 out of 420
Forecast 32 out of 420
Forecast 33 out of 420
Forecast 34 out of 420
Forecast 35 out of 420
Forecast 36 out of 420
Forecast 37 out of 420
Forecast 38 out of 420
Forecast 39 out of 420
Forecast 40 out of 420
Forecast 41 out of 420
Forecast 42 out of 420
Forecast 43 out of 42

### Using data I downloaded from FRED myself

#### IP Growth with regular regression
R2_spca:  0.15204624361844543
R2_pca:  0.08839576318473485

#### Inflation with regular regression
R2_spca:  0.10187410329181379
R2_pca:  -0.010082097586632255

#### Unemployment with regular regression
R2_spca:  0.12376440673071476
R2_pca:  0.08471787659274999

### Using data from Matlab

#### Inflation with regular regression
R2_spca:  0.10457994917552516
R2_pca:  -0.002606340261798845

#### Inflation with ARDL
R2_spca:  0.1183403234003777
R2_pca:  0.04379090139180419

#### IP Growth with ARDL
R2_spca:  0.05743711727643541
R2_pca:  0.05442432341881198

#### IP Growth with regular regression
R2_spca:  0.15276614073969763
R2_pca:  0.08666548308729716

In [271]:
# Pull the forecast-error arrays of the three models out of the result dict
errors_ar, errors_pca, errors_spca = (
    result[key] for key in ('error_ar', 'error_pca', 'error_spca')
)

# Truncate how numpy renders array elements to at most 7 characters
np.set_printoptions(formatter={'all':lambda x: str(x)[:7]})

# Mean squared forecast error of each model
mse_ar = np.mean(errors_ar ** 2)
mse_pca = np.mean(errors_pca ** 2)
mse_spca = np.mean(errors_spca ** 2)

for label, mse in (("MSE AR: ", mse_ar),
                   ("MSE PCA: ", mse_pca),
                   ("MSE SPCA: ", mse_spca)):
    print(label, mse)

MSE AR:  3.710772719610096e-05
MSE PCA:  3.418341654922564e-05
MSE SPCA:  3.220099220521522e-05
