In [88]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from src.helpers.functions import get_data, pc_T, predict_pca, estimate_AR_res, generate_data
from src.helpers.functions import select_AR_lag_SIC, winsor

In [117]:
from src.helpers.regression import linear_reg

def out_sample(X, y):
    h = 1
    T = y.shape[0]

    M = (1984-1959)*12  # In sample periods
    N = T - M  # Out of sample periods

    error_pca = np.zeros((N - h, 1))  # Forecast errors of PCA 
    error_spca = np.zeros((N - h, 1))  # Forecast errors of scaled PCA
    error_ar = np.zeros((N - h, 1))  # Forecast errors of AR model

    p_max = 1  # Number of lags for AR(p) model
    nfac = 5

    # Prepare the regression model
    reg_pc = LinearRegression()
    reg_spc = LinearRegression()

    # Estimate the PCA model
    for n in range(N - h):
        # Use all available data up to time t
        X_train = X[:(M + n), :]
        y_train = y[:M + n]
        y_actual = y[M + n]

        # NOTE: The lag structure for X and y has not been implemented yet

        # Get number lags
        p_AR_star_n = select_AR_lag_SIC(y_train, h, p_max=p_max)
        a_hat, res_h = estimate_AR_res(y_train, h, p_AR_star_n)

        # Compute the forecast error of the AR model
        if p_AR_star_n > 0:
            forecast_ar = a_hat[0] + np.dot(a_hat[1:], y_train[-p_AR_star_n:])
        else:
            print("No AR model selected!")
            forecast_ar = np.mean(y_train)

        error_ar[n] = y_actual - forecast_ar

        # Standardize the data
        X_train = (X_train - np.mean(X_train, axis=0)) / np.std(X_train, axis=0)

        # Initialize the betas for scaling the variables
        beta = np.full(X_train.shape[1], np.nan)
        tstat = np.full(X_train.shape[1], np.nan)
        
        # Compute the betas for scaling the variables
        for j in range(X_train.shape[1]):
            #lm = lr.fit(Zs[:-h, j].reshape(-1, 1), y_h[1:])
            #parm = lm.coef_
            parm, std_err, t_stat, reg_se, adj_r2, bic = linear_reg(y_train[h:], X_train[:-h, j].reshape(-1, 1), constant=1, nlag=h)
            beta[j] = parm[1]

        # Winsorizing should be done at (0, 90)
        beta_win = winsor(np.abs(beta), p=(0, 90))
        scaleX_train = np.zeros(X_train.shape)

        # Scale the factors by the winsorized betas
        for j in range(X_train.shape[1]):
            scaleX_train[:, j] = X_train[:, j] * beta_win[j]

        # Compute the principal components
        _, x_pc, _, _, _ = pc_T(X_train, nfac)
        _, x_spc, _, _, _ = pc_T(scaleX_train, nfac)

        # Estimate regression coefficients
        reg_pc.fit(x_pc[:-h], y_train[h:])
        reg_spc.fit(x_spc[:-h], y_train[h:])
        
        # Compute the forecast
        y_hat_pca = reg_pc.predict(x_pc[-1].reshape(1, -1))
        y_hat_spca = reg_spc.predict(x_spc[-1].reshape(1, -1))

        # Compute the forecast error
        error_pca[n] = y_actual - y_hat_pca
        error_spca[n] = y_actual - y_hat_spca

        print("Current SSE: ", np.sum(error_spca**2))

    # Compute the R squared out of sample against the AR model
    SSE_pca = np.sum(error_pca**2)
    SSE_spca = np.sum(error_spca**2)
    SSE_ar = np.sum(error_ar**2)

    R2_spca = 100*(1 - SSE_spca / SSE_ar)
    R2_pca = 100*(1 - SSE_pca / SSE_ar)

    print("R2_spca: ", R2_spca)
    print("R2_pca: ", R2_pca)
    
    return {"error_pca": error_pca, "error_spca": error_spca, "error_ar": error_ar, "R2_spca": R2_spca, "R2_pca": R2_pca}

In [118]:
variables = get_data()
data = variables['data']
inflation = variables['inflation']

result = out_sample(X = data.values, y = inflation)

[*********************100%***********************]  1 of 1 completed
Current SSE:  2.3643489363366063e-07
Current SSE:  5.518274333485011e-06
Current SSE:  6.542238941257539e-06
Current SSE:  1.004406678028683e-05
Current SSE:  1.0289428973837464e-05
Current SSE:  1.0525476441518869e-05
Current SSE:  1.0538835154772175e-05
Current SSE:  1.733599805682817e-05
Current SSE:  2.2929708898034197e-05
Current SSE:  2.297533809497555e-05
Current SSE:  2.363132631408236e-05
Current SSE:  2.4465678738224293e-05
Current SSE:  2.4702459361899364e-05
Current SSE:  6.559073861051859e-05
Current SSE:  0.0001424417444706118
Current SSE:  0.0001770653154043072
Current SSE:  0.00017709114871292808
Current SSE:  0.00017715613063677357
Current SSE:  0.00018509362076745106
Current SSE:  0.00018752371012260244
Current SSE:  0.00018911773178284869
Current SSE:  0.0001898316200249181
Current SSE:  0.00019088788066655413
Current SSE:  0.000191064555725722
Current SSE:  0.00019256232369114682
Current SSE:  0.00

In [122]:
errors_ar = result['error_ar']
errors_pca = result['error_pca']
errors_spca = result['error_spca']

# Put erros_ar and errors_pca into a dataframe
errors_ar = errors_ar[:-1]

#errors = pd.DataFrame({'errors_ar': errors_ar, 'errors_pca': errors_pca.flatten()})
np.set_printoptions(formatter={'all':lambda x: str(x)[:7]})

sse_ar = np.sum(errors_ar**2)
sse_pca = np.sum(errors_pca**2)
sse_spca = np.sum(errors_spca**2)

sse_spca / sse_ar

1- sse_spca / sse_ar

0.10166655774121325