In [165]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from src.helpers.functions import get_data, pc_T, predict_pca, estimate_AR_res, generate_data
from src.helpers.functions import select_AR_lag_SIC, winsor
from src.helpers.ardl_multi import ARDL_multi
from src.helpers.regression import linear_reg


%reload_ext autoreload
%autoreload 2

In [169]:
def out_sample(X, y, in_sample_periods=(1984 - 1959) * 12, nfac=5, verbose=True):
    """Compare one-step-ahead forecasts of PCA, scaled-PCA and AR models out of sample.

    An expanding estimation window is used: for out-of-sample period ``n`` the
    models are estimated on observations ``0 .. in_sample_periods + n - 1`` and
    the forecast is compared with ``y[in_sample_periods + n]``.

    Parameters
    ----------
    X : numpy.ndarray
        Predictor matrix indexed as ``X[t, j]`` (observation ``t``, predictor ``j``).
    y : numpy.ndarray
        Target series indexed as ``y[t]``, aligned with the rows of ``X``.
    in_sample_periods : int, optional
        Number of initial observations that are only used for estimation.
        Defaults to ``(1984 - 1959) * 12`` (= 300), the previously hard-coded value.
    nfac : int, optional
        Number of principal components requested from ``pc_T``. Defaults to 5.
    verbose : bool, optional
        When True (default) print a progress line for every forecast and the
        summary statistics at the end.

    Returns
    -------
    dict
        ``"error_pca"``, ``"error_spca"``, ``"error_ar"``: arrays of shape
        ``(N, 1)`` holding actual minus forecast for each model, where
        ``N = len(y) - in_sample_periods``.
        ``"R2_spca"``, ``"R2_pca"``: ``1 - SSE_model / SSE_ar``, i.e. the
        out-of-sample R squared measured against the AR benchmark.

    Raises
    ------
    ValueError
        If ``in_sample_periods`` leaves no observations to forecast.
    """
    h = 1  # Forecast horizon; the target alignment y[M + n] below is written for h == 1
    p_max = 1  # Max number of lags for AR(p) model

    T = y.shape[0]
    M = in_sample_periods  # In sample periods
    N = T - M  # Out of sample periods
    if not 0 < M < T:
        raise ValueError(
            "in_sample_periods must be between 1 and len(y) - 1, got {} for {} observations".format(M, T)
        )

    forecast_pca = np.zeros((N, 1))  # Forecasts of the PCA model
    forecast_spca = np.zeros((N, 1))  # Forecasts of the scaled PCA model
    forecast_ar = np.zeros((N, 1))  # Forecasts of the AR benchmark
    actual_y = np.zeros((N, 1))  # Realized values of y

    # The same estimator objects are refitted in every iteration
    reg_pc = LinearRegression()
    reg_spc = LinearRegression()

    # Loop over all out of sample periods
    for n in range(N):
        if verbose:
            print("Forecast n {}".format(n))

        # Use all available data up to (but excluding) the period being forecast
        X_t = X[:(M + n), :]
        y_t = y[:M + n]
        actual_y[n] = y[M + n]

        # Standardize each predictor with the mean/std of the current window only
        X_t = (X_t - np.mean(X_t, axis=0)) / np.std(X_t, axis=0)

        # Lag order for the AR benchmark (0 means "no lags")
        p_AR_star_n = select_AR_lag_SIC(y_t, h, p_max=p_max)

        # Forecast of the AR benchmark; falls back to the sample mean without lags
        if p_AR_star_n > 0:
            a_hat, _ = estimate_AR_res(y_t, h, p_AR_star_n)
            forecast_ar[n] = a_hat[0] + np.dot(a_hat[1:], y_t[-p_AR_star_n:])
        else:
            forecast_ar[n] = np.mean(y_t)

        # Slope of y_{t+h} on each single predictor, used as that predictor's scale
        beta = np.full(X_t.shape[1], np.nan)
        for j in range(X_t.shape[1]):
            # Only the coefficient vector is needed; parm[1] is read as the slope
            # (index 0 presumably being the constant requested via constant=1)
            parm, *_ = linear_reg(y_t[h:], X_t[:-h, j].reshape(-1, 1), constant=1, nlag=p_AR_star_n)
            beta[j] = parm[1]

        # Winsorize the absolute betas
        beta_win = winsor(np.abs(beta), p=(0, 90))

        # Scale every predictor column by its winsorized beta (column-wise broadcast)
        scaleX_t = X_t * np.ravel(beta_win)

        # Principal components of the plain and of the scaled predictors;
        # the second return value of pc_T is used as the factor matrix
        _, x_pc, _, _, _ = pc_T(X_t, nfac)
        _, x_spc, _, _, _ = pc_T(scaleX_t, nfac)

        # Disabled alternative specification (ARDL), kept for reference.
        # NOTE(review): presumably what produced the "with ARDL" figures logged
        # further down in the notebook -- confirm before relying on it.
        #
        # p_ARDL = (p_AR_star_n, 1)
        # y_n_last = y_t[-p_AR_star_n:]
        #
        # if p_AR_star_n > 0:
        #     c_hat_n_pca = ARDL_multi(y=y_t[h:], z=x_pc[:-h], h=h, p=p_ARDL)
        #     c_hat_n_spca = ARDL_multi(y=y_t[h:], z=x_spc[:-h], h=h, p=p_ARDL)
        #
        #     y_hat_pca = np.concatenate(([1], y_n_last, x_pc[-1])) @ c_hat_n_pca
        #     y_hat_spca = np.concatenate(([1], y_n_last, x_spc[-1])) @ c_hat_n_spca
        # else:
        #     c_hat_n_pca = linear_reg(y_t[h:], x_pc[:-h], constant=1, nlag=h)[0]
        #     c_hat_n_spca = linear_reg(y_t[h:], x_spc[:-h], constant=1, nlag=h)[0]
        #
        #     y_hat_pca = np.concatenate(([1], x_pc[-1])) @ c_hat_n_pca
        #     y_hat_spca = np.concatenate(([1], x_spc[-1])) @ c_hat_n_spca

        # Add lag of y_t to the factors
        if p_AR_star_n > 0:
            # NOTE: This only adds 1 lag. If pmax > 1 need to adjust
            x_pc = np.insert(x_pc, nfac, y_t, axis=1)
            x_spc = np.insert(x_spc, nfac, y_t, axis=1)

        # Regress y_{t+h} on the regressors observed at t
        reg_pc.fit(x_pc[:-h], y_t[h:])
        reg_spc.fit(x_spc[:-h], y_t[h:])

        # Forecast from the most recent row of regressors
        forecast_pca[n] = reg_pc.predict(x_pc[-1].reshape(1, -1))
        forecast_spca[n] = reg_spc.predict(x_spc[-1].reshape(1, -1))

    # Compute the forecast errors
    error_pca = actual_y - forecast_pca
    error_spca = actual_y - forecast_spca
    error_ar = actual_y - forecast_ar

    # Compute the R squared out of sample against the AR model
    SSE_pca = np.sum(error_pca**2)
    SSE_spca = np.sum(error_spca**2)
    SSE_ar = np.sum(error_ar**2)

    R2_spca = (1 - SSE_spca / SSE_ar)
    R2_pca = (1 - SSE_pca / SSE_ar)

    if verbose:
        # R squared values are printed in percent; the returned values are fractions
        print("R2_spca: ", round(R2_spca * 100, 2))
        print("R2_pca: ", round(R2_pca* 100, 2))

        print("SSE PCA: ", SSE_pca, "SSE SPCA: ", SSE_spca, "SSE AR: ", SSE_ar)

    return {"error_pca": error_pca,
            "error_spca": error_spca,
            "error_ar": error_ar,
            "R2_spca": R2_spca,
            "R2_pca": R2_pca}

In [170]:
# Fetch the dataset once, then pull out the predictor block and the target
# series as plain NumPy arrays.
variables = get_data()
X, y = (variables[name].values for name in ('data', 'unemployment'))

# Sanity check: both arrays should have the same number of observations
print("Shape of X: ", X.shape, "Shape of y: ", y.shape)

Shape of X:  (720, 123) Shape of y:  (720,)


In [171]:
# Run the out-of-sample forecasting exercise on the loaded data and keep the
# returned errors / R squared values for the cells below
result = out_sample(X=X, y=y)

Forecast n 0
Forecast n 1
Forecast n 2
Forecast n 3
Forecast n 4
Forecast n 5
Forecast n 6
Forecast n 7
Forecast n 8
Forecast n 9
Forecast n 10
Forecast n 11
Forecast n 12
Forecast n 13
Forecast n 14
Forecast n 15
Forecast n 16
Forecast n 17
Forecast n 18
Forecast n 19
Forecast n 20
Forecast n 21
Forecast n 22
Forecast n 23
Forecast n 24
Forecast n 25
Forecast n 26
Forecast n 27
Forecast n 28
Forecast n 29
Forecast n 30
Forecast n 31
Forecast n 32
Forecast n 33
Forecast n 34
Forecast n 35
Forecast n 36
Forecast n 37
Forecast n 38
Forecast n 39
Forecast n 40
Forecast n 41
Forecast n 42
Forecast n 43
Forecast n 44
Forecast n 45
Forecast n 46
Forecast n 47
Forecast n 48
Forecast n 49
Forecast n 50
Forecast n 51
Forecast n 52
Forecast n 53
Forecast n 54
Forecast n 55
Forecast n 56
Forecast n 57
Forecast n 58
Forecast n 59
Forecast n 60
Forecast n 61
Forecast n 62
Forecast n 63
Forecast n 64
Forecast n 65
Forecast n 66
Forecast n 67
Forecast n 68
Forecast n 69
Forecast n 70
Forecast n 71
Fo

### Using data I downloaded from FRED myself

#### IP Growth with regular regression
R2_spca:  0.15204624361844543
R2_pca:  0.08839576318473485

#### Inflation with regular regression
R2_spca:  0.10187410329181379
R2_pca:  -0.010082097586632255

#### Unemployment with regular regression
R2_spca:  0.12376440673071476
R2_pca:  0.08471787659274999

### Using data from Matlab

#### Inflation with regular regression
R2_spca:  0.10457994917552516
R2_pca:  -0.002606340261798845

#### Inflation with ARDL
R2_spca:  0.1183403234003777
R2_pca:  0.04379090139180419

#### IP Growth with ARDL
R2_spca:  0.05743711727643541
R2_pca:  0.05442432341881198

#### IP Growth with regular regression
R2_spca:  0.15276614073969763
R2_pca:  0.08666548308729716

In [115]:
# Pull the three forecast-error series out of the results dictionary
errors_ar, errors_pca, errors_spca = (
    result[key] for key in ('error_ar', 'error_pca', 'error_spca')
)

# Global NumPy print setting: every array element is shown as the first
# 7 characters of its string form
np.set_printoptions(formatter={'all': lambda value: str(value)[:7]})

# Mean squared forecast error of each model
mse_ar = np.mean(np.square(errors_ar))
mse_pca = np.mean(np.square(errors_pca))
mse_spca = np.mean(np.square(errors_spca))

print("MSE AR: ", mse_ar)
print("MSE PCA: ", mse_pca)
print("MSE SPCA: ", mse_spca)

MSE AR:  4.2139465760574156e-05
MSE PCA:  4.666149746287407e-05
MSE SPCA:  4.605713003599992e-05
