In [88]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from src.helpers.functions import get_data, pc_T, predict_pca, estimate_AR_res, generate_data
from src.helpers.functions import select_AR_lag_SIC, winsor

In [138]:
from src.helpers.regression import linear_reg

def out_sample(X, y):
    h = 1
    T = y.shape[0]

    M = (1984-1959)*12  # In sample periods
    N = T - M  # Out of sample periods

    error_pca = np.zeros((N - h, 1))  # Forecast errors of PCA 
    error_spca = np.zeros((N - h, 1))  # Forecast errors of scaled PCA
    error_ar = np.zeros((N - h, 1))  # Forecast errors of AR model

    p_max = 1  # Number of lags for AR(p) model
    nfac = 5

    # Prepare the regression model
    reg_pc = LinearRegression()
    reg_spc = LinearRegression()

    # Estimate the PCA model
    for n in range(N - h):
        # Use all available data up to time t
        X_train = X[:(M + n), :]
        y_train = y[:M + n]
        y_actual = y[M + n]

        # NOTE: The lag structure for X and y has not been implemented yet at this point of the function
        # Get number lags
        p_AR_star_n = select_AR_lag_SIC(y_train, h, p_max=p_max)
        a_hat, res_h = estimate_AR_res(y_train, h, p_AR_star_n)

        # Compute the forecast error of the AR model
        if p_AR_star_n > 0:
            forecast_ar = a_hat[0] + np.dot(a_hat[1:], y_train[-p_AR_star_n:])
        else:
            print("No AR model selected!")
            forecast_ar = np.mean(y_train)

        error_ar[n] = y_actual - forecast_ar

        # Standardize the data
        X_train = (X_train - np.mean(X_train, axis=0)) / np.std(X_train, axis=0)

        # Initialize the betas for scaling the variables
        beta = np.full(X_train.shape[1], np.nan)
        tstat = np.full(X_train.shape[1], np.nan)
        
        # Compute the betas for scaling the variables
        for j in range(X_train.shape[1]):
            parm, std_err, t_stat, reg_se, adj_r2, bic = linear_reg(y_train[h:], X_train[:-h, j].reshape(-1, 1), constant=1, nlag=h)
            beta[j] = parm[1]

        # Winsorizing should be done at (0, 90)
        beta_win = winsor(np.abs(beta), p=(0, 90))
        scaleX_train = np.zeros(X_train.shape)

        # Scale the factors by the winsorized betas
        for j in range(X_train.shape[1]):
            scaleX_train[:, j] = X_train[:, j] * beta_win[j]

        # Compute the principal components
        _, x_pc, _, _, _ = pc_T(X_train, nfac)
        _, x_spc, _, _, _ = pc_T(scaleX_train, nfac)

        # Estimate regression coefficients
        reg_pc.fit(x_pc[:-h], y_train[h:])
        reg_spc.fit(x_spc[:-h], y_train[h:])
        
        # Compute the forecast
        y_hat_pca = reg_pc.predict(x_pc[-1].reshape(1, -1))
        y_hat_spca = reg_spc.predict(x_spc[-1].reshape(1, -1))

        # Compute the forecast error
        error_pca[n] = y_actual - y_hat_pca
        error_spca[n] = y_actual - y_hat_spca

        print(f"Run {n} Current SSE of pca: ", np.sum(error_pca**2))

    # Compute the R squared out of sample against the AR model
    SSE_pca = np.sum(error_pca**2)
    SSE_spca = np.sum(error_spca**2)
    SSE_ar = np.sum(error_ar**2)

    R2_spca = (1 - SSE_spca / SSE_ar)
    R2_pca = (1 - SSE_pca / SSE_ar)

    print("R2_spca: ", R2_spca)
    print("R2_pca: ", R2_pca)
    
    return {"error_pca": error_pca, "error_spca": error_spca, "error_ar": error_ar, "R2_spca": R2_spca, "R2_pca": R2_pca}

In [139]:
variables = get_data()
X = variables['data']
y = variables['inflation']

print("Shape of X: ", X.shape, "Shape of y: ", y.shape)

[*********************100%***********************]  1 of 1 completed
Shape of X:  (720, 123) Shape of y:  (720,)


In [140]:
# Run the forecasting exercise
result = out_sample(X = X.values, y = y)

Run 0 Current SSE of pca:  2.0817572955371303e-07
Run 1 Current SSE of pca:  8.56187588730542e-06
Run 2 Current SSE of pca:  1.434810999811504e-05
Run 3 Current SSE of pca:  1.6120606666127634e-05
Run 4 Current SSE of pca:  1.6594252743918073e-05
Run 5 Current SSE of pca:  1.733370305730119e-05
Run 6 Current SSE of pca:  1.7525932398221622e-05
Run 7 Current SSE of pca:  1.951162406371568e-05
Run 8 Current SSE of pca:  2.165599014131261e-05
Run 9 Current SSE of pca:  2.1823883460421333e-05
Run 10 Current SSE of pca:  2.241612159943014e-05
Run 11 Current SSE of pca:  2.645846893658633e-05
Run 12 Current SSE of pca:  2.6598939499759038e-05
Run 13 Current SSE of pca:  5.8124920549136906e-05
Run 14 Current SSE of pca:  0.00012778632077915344
Run 15 Current SSE of pca:  0.00016208237029873926
Run 16 Current SSE of pca:  0.00016208383798121246
Run 17 Current SSE of pca:  0.00016221607668020526
Run 18 Current SSE of pca:  0.00016647007955652868
Run 19 Current SSE of pca:  0.0001708076544296672

In [124]:
errors_ar = result['error_ar']
errors_pca = result['error_pca']
errors_spca = result['error_spca']

#errors = pd.DataFrame({'errors_ar': errors_ar, 'errors_pca': errors_pca.flatten()})
np.set_printoptions(formatter={'all':lambda x: str(x)[:7]})

sse_ar = np.sum(errors_ar**2)
sse_pca = np.sum(errors_pca**2)
sse_spca = np.sum(errors_spca**2)

print(errors_ar)

[[0.00059]
 [0.00436]
 [0.00082]
 [-0.0013]
 [0.00059]
 [0.00151]
 [-4.9798]
 [0.00058]
 [0.00058]
 [0.00242]
 [0.00207]
 [0.00143]
 [0.00051]
 [-0.0043]
 [-0.0042]
 [0.00013]
 [0.00531]
 [0.00176]
 [-0.0016]
 [0.00028]
 [0.00301]
 [-0.0006]
 [0.00056]
 [0.00236]
 [0.00291]
 [-0.0001]
 [0.00110]
 [0.00199]
 [-0.0003]
 [0.00258]
 [-0.0003]
 [0.00257]
 [0.00048]
 [0.00021]
 [0.00168]
 [-0.0006]
 [0.00226]
 [-0.0006]
 [0.00139]
 [0.00422]
 [-0.0015]
 [0.00249]
 [0.00132]
 [0.00132]
 [0.00131]
 [0.00047]
 [0.00104]
 [0.00104]
 [0.00186]
 [0.00047]
 [0.00267]
 [0.00397]
 [-0.0001]
 [-8.6823]
 [0.00101]
 [-0.0021]
 [0.00240]
 [0.00315]
 [0.00071]
 [0.00046]
 [0.00729]
 [-0.0025]
 [0.00203]
 [-0.0008]
 [-1.8481]
 [0.00513]
 [0.00046]
 [0.00530]
 [0.00117]
 [0.00219]
 [-0.0023]
 [0.00222]
 [0.00122]
 [-0.0017]
 [-0.0004]
 [0.00222]
 [0.00220]
 [0.00047]
 [-0.0005]
 [0.00194]
 [0.00095]
 [-0.0005]
 [0.00338]
 [-2.1186]
 [-0.0012]
 [0.00168]
 [0.00215]
 [-0.0002]
 [0.00070]
 [0.00141]
 [0.00093]