In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

from functions import winsor, sPCAest
from scipy.ndimage import shift
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
def generate_data(n, T, N, h_steps=1):
    """Simulate a two-factor predictor panel and an h-step-ahead target.

    Data-generating process:
        y_{t+h} = g_t + e_{t+h},   with g_t, e_{t+h} ~ N(0, 1)
        X_{t,i} = g_t * phi_i + h_t * psi_i + u_{t,i}

    Only the first ``n`` of the ``N`` loadings phi_i / psi_i are non-zero
    (drawn from U(0, 1)); the idiosyncratic errors u_{t,i} are independent
    normals whose variances are drawn from U(0, 1).

    Parameters
    ----------
    n : int
        Number of predictors with non-zero loadings.
    T : int
        Number of observations returned.
    N : int
        Number of predictors (columns of X).
    h_steps : int, default 1
        Forecast horizon; ``T + h_steps`` draws are made and the first
        ``h_steps`` are discarded.

    Returns
    -------
    dict
        ``'y_t'`` (T,): the target lagged by ``h_steps``, so that
        ``y_t[j + h_steps] == y_h[j]``;
        ``'y_h'`` (T,): ``g + e``, aligned row-by-row with ``X``;
        ``'X'`` (T, N): predictor panel;
        ``'g'`` (T,): the factor driving the target;
        ``'e'`` (T,): the target noise.
    """
    # Draw h_steps extra observations so the lagged series has a full history.
    n_draws = T + h_steps

    # The order of the random draws (g, h, e, phi, psi, variances, u) is kept
    # fixed so that results under a given seed do not change.
    factor_g = np.random.normal(0, 1, n_draws)
    factor_h = np.random.normal(0, 1, n_draws)
    noise = np.random.normal(0, 1, n_draws)

    # Target series aligned with the factor that drives it.
    target = factor_g + noise

    # Lagged target: position i holds target[i - h_steps]; the first h_steps
    # entries stay at zero and are discarded below.
    lagged = np.zeros(n_draws)
    lagged[h_steps:] = target[:n_draws - h_steps]

    # Sparse loadings: only the first n entries are non-zero.
    phi = np.zeros(N)
    phi[:n] = np.random.uniform(0, 1, n)
    psi = np.zeros(N)
    psi[:n] = np.random.uniform(0, 1, n)

    # Idiosyncratic errors with a diagonal covariance matrix.
    variance_u = np.random.uniform(0, 1, N)
    u = np.random.multivariate_normal(np.zeros(N), np.diag(variance_u), T)

    # Discard the first h_steps observations of every length-(T + h) series.
    target = target[h_steps:]
    lagged = lagged[h_steps:]
    noise = noise[h_steps:]
    factor_g = factor_g[h_steps:]
    factor_h = factor_h[h_steps:]

    # X[t, i] = g_t * phi_i + h_t * psi_i + u[t, i], built for all i at once.
    X = np.outer(factor_g, phi) + np.outer(factor_h, psi) + u

    return {
        'y_t': lagged,
        'y_h': target,
        'X': X,
        'g': factor_g,
        'e': noise,
    }

# Smoke test: simulate one panel and eyeball shapes and the first few values.
# y_t should be y_h shifted by one position (h_steps=1), and y_h = g + e.
results = generate_data(10, T=250, N=500, h_steps=1)
y_t, y_h, X, e, g = (results[key] for key in ('y_t', 'y_h', 'X', 'e', 'g'))

print(y_t.shape, y_h.shape, X.shape, sep='\n')

print('y_t: ', y_t[:10])
print('y_h: ', y_h[:10])
print()
print('e: ', e[:10])
print('g: ', g[:10])


(250,)
(250,)
(250, 500)
y_t:  [ 1.37812073  2.40511848 -0.80551248 -0.57257206 -0.68747982  2.18278669
  0.75645577 -1.44036312 -1.00839661  0.70745018]
y_h:  [ 2.40511848 -0.80551248 -0.57257206 -0.68747982  2.18278669  0.75645577
 -1.44036312 -1.00839661  0.70745018  2.00432616]

e:  [ 2.37833222 -0.78314739 -0.76014263  0.39733133  1.61669018  0.12103457
 -1.18397144 -0.91833556  1.53073035  0.7577641 ]
g:  [ 0.02678626 -0.0223651   0.18757057 -1.08481115  0.56609652  0.6354212
 -0.25639168 -0.09006105 -0.82328017  1.24656206]


In [65]:
def simulate_PCA(n=10, T=250, N=500, h_steps=1, R=10, nfac=2, method="sPCA",
                 initial_window=200, verbose=True):
    """Monte Carlo evaluation of factor-based h-step-ahead forecasts.

    For each of ``R`` replications a fresh data set is drawn with
    ``generate_data``. Factors are then re-estimated on an expanding window
    (starting at ``initial_window`` observations and growing by one each
    period), the target is regressed on the factors, and the out-of-sample
    forecast error is recorded.

    Parameters
    ----------
    n, T, N, h_steps
        Passed through to ``generate_data``.
    R : int, default 10
        Number of Monte Carlo replications.
    nfac : int, default 2
        Number of factors to extract.
    method : {"sPCA", "PCA"}, default "sPCA"
        Factor estimator: ``sPCAest`` from ``functions.py`` or plain PCA on
        standardised predictors.
    initial_window : int, default 200
        Size of the first estimation window.
    verbose : bool, default True
        Print progress for every period and the MSE of every run.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(R, T - initial_window - h_steps - 1)`` holding the
        forecast errors (prediction minus realisation).

    Raises
    ------
    ValueError
        If ``method`` is not recognised, ``h_steps`` is smaller than 1, or
        ``T`` / ``initial_window`` leave no training or evaluation data.
    """
    # Fail fast on bad arguments instead of hitting a NameError on `factors`
    # (or silently reusing stale factors) deep inside the loop.
    if method not in ("sPCA", "PCA"):
        raise ValueError(
            "Unknown method {!r}; expected 'sPCA' or 'PCA'".format(method))
    if h_steps < 1:
        raise ValueError("h_steps must be at least 1, got {}".format(h_steps))
    if initial_window <= h_steps:
        raise ValueError("initial_window must be larger than h_steps")

    # With the defaults (T=250, initial_window=200) this is 50 - h_steps - 1.
    n_periods = T - initial_window - h_steps - 1
    if n_periods < 1:
        raise ValueError(
            "T={} leaves no evaluation periods for initial_window={} and "
            "h_steps={}".format(T, initial_window, h_steps))

    # One row of forecast errors per replication.
    error_mat = np.zeros((R, n_periods))

    for r in range(R):
        variables = generate_data(n=n, T=T, h_steps=h_steps, N=N)

        # Row i of X holds the predictors at time i and y_h[i] is the target
        # h_steps later that those predictors are meant to forecast.
        y_h = variables['y_h']
        X = variables['X']

        error = np.zeros(n_periods)

        # Expanding-window estimation: the window grows by one observation
        # per period, starting from initial_window observations.
        for t in range(n_periods):
            if verbose:
                print("Run {} Period {}".format(r, t))

            idx_split = initial_window + t

            X_train = X[:idx_split]
            y_train = y_h[:idx_split]

            # The forecast is made from the last row of X_train, so the
            # matching realisation is the target aligned with that row.
            # (For h_steps == 1 this equals the lagged series at idx_split.)
            y_test = y_h[idx_split - 1]

            if method == "sPCA":
                # NOTE(review): y_train still contains the last h_steps
                # targets, which are not yet observed at the forecast origin
                # (including the one being forecast). They are excluded from
                # the regression below, but sPCAest receives them; confirm it
                # does not use them, otherwise the sPCA forecast has
                # look-ahead bias.
                factors, _ = sPCAest(target=y_train, X=X_train, nfac=nfac)
            else:
                pca = PCA(n_components=nfac)
                X_normalized = StandardScaler(
                    with_mean=True, with_std=True).fit_transform(X_train)
                pca.fit(X_normalized)
                factors = pca.transform(X_normalized)

            # Fit only on pairs whose target is already observed.
            reg = LinearRegression().fit(factors[:-h_steps, :],
                                         y_train[:-h_steps])

            # Forecast from the most recent factor observation.
            y_pred = reg.predict(factors[-1].reshape(1, -1))

            error[t] = np.ravel(y_pred)[0] - y_test

        if verbose:
            print("MSE of run {} is {}".format(r, np.mean(error**2)))
        error_mat[r, :] = error

    return error_mat

In [80]:
# Monte Carlo run: 10 replications of plain PCA with two factors, one step ahead.
errors = simulate_PCA(
    n=10,
    T=250,
    N=500,
    h_steps=1,
    R=10,
    nfac=2,
    method="PCA",
)

Run 0 Period 0
Run 0 Period 1
Run 0 Period 2
Run 0 Period 3
Run 0 Period 4
Run 0 Period 5
Run 0 Period 6
Run 0 Period 7
Run 0 Period 8
Run 0 Period 9
Run 0 Period 10
Run 0 Period 11
Run 0 Period 12
Run 0 Period 13
Run 0 Period 14
Run 0 Period 15
Run 0 Period 16
Run 0 Period 17
Run 0 Period 18
Run 0 Period 19
Run 0 Period 20
Run 0 Period 21
Run 0 Period 22
Run 0 Period 23
Run 0 Period 24
Run 0 Period 25
Run 0 Period 26
Run 0 Period 27
Run 0 Period 28
Run 0 Period 29
Run 0 Period 30
Run 0 Period 31
Run 0 Period 32
Run 0 Period 33
Run 0 Period 34
Run 0 Period 35
Run 0 Period 36
Run 0 Period 37
Run 0 Period 38
Run 0 Period 39
Run 0 Period 40
Run 0 Period 41
Run 0 Period 42
Run 0 Period 43
Run 0 Period 44
Run 0 Period 45
Run 0 Period 46
Run 0 Period 47
MSE of run 0 is 1.4040420672424636
Run 1 Period 0
Run 1 Period 1
Run 1 Period 2
Run 1 Period 3
Run 1 Period 4
Run 1 Period 5
Run 1 Period 6
Run 1 Period 7
Run 1 Period 8
Run 1 Period 9
Run 1 Period 10
Run 1 Period 11
Run 1 Period 12
Run 1 Per

In [82]:
# Per-run MSE (one value per replication), then its median and mean across runs.
mse_vec = np.mean(np.square(errors), axis=1)
median_mse = np.median(mse_vec)
mean_mse = np.mean(mse_vec)
print(
    "Median MSE is {}".format(median_mse),
    "Mean MSE is {}".format(mean_mse),
    "MSE is {}".format(mse_vec),
    sep="\n",
)

Median MSE is 1.5065399397136112
Mean MSE is 1.5722949167209856
MSE is [1.40404207 1.54178167 1.51190038 1.76377905 1.43468898 1.4095839
 1.59747451 1.3157444  2.24277473 1.5011795 ]
