In [241]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

from functions import winsor, sPCAest
from scipy.ndimage import shift
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [299]:
def generate_data(n, T, N, h_steps=1):
    # Generate y_{t + h} = g_t + e_{t + h}
    # e_{t + h} ~ N(0, 1)
    # g_t ~ N(0, 1)
    # X_t,i = g_t*phi_i + h_t*psi_i + u_t,i

    # Set T to T + h to account for lag
    T_plus_h = T + h_steps

    # Generate g_t
    g = np.random.normal(0, 1, T_plus_h)

    # Generate h_t
    h = np.random.normal(0, 1, T_plus_h)

    # Generate e_t+h
    e = np.random.normal(0, 1, T_plus_h)

    # Generate y_{t+h}
    y_h = g + e

    # Generate y_t
    y_t = np.zeros(T_plus_h)
    for i in range(h_steps, T_plus_h):
        y_t[i] = y_h[i - h_steps]

    # Generate phi_i (Nx1) with n < N nonzero elements
    phi = np.zeros(N)
    phi[:n] = np.random.uniform(0, 1, n)

    # Generate psi_i (Nx1) with n < N nonzero elements
    psi = np.zeros(N)
    psi[:n] = np.random.uniform(0, 1, n)

    # Generate idiosyncratic error's variances
    variance_u = np.random.uniform(0, 1, N)

    # Generate covariance matrix for u_t
    sigma_u = np.diag(variance_u)

    # Generate u
    u = np.random.multivariate_normal(np.zeros(N), sigma_u, T)

    # Drop first h rows
    y_h = y_h[h_steps:]
    y_t = y_t[h_steps:]
    e = e[h_steps:]
    g = g[h_steps:]
    h = h[h_steps:]

    

    # Generate X_t
    X = np.zeros((T, N))
    for i in range(N):
        X[:, i] = g*phi[i] + h*psi[i] + u[:, i]

    results = {'y_t':y_t,
               'y_h':y_h,
                'X':X,
                'g':g,
                'e':e,
                }
    
    return results

# Testing
results = generate_data(10, T=250, N=500, h_steps=1)
y_t = results['y_t']
y_h = results['y_h']
X = results['X']
e = results['e']
g = results['g']

print(y_t.shape)
print(y_h.shape)
print(X.shape)

print('y_t: ', y_t[:10])
print('y_h: ', y_h[:10])
print()
print('e: ', e[:10])
print('g: ', g[:10])


(250,)
(250,)
(250, 500)
y_t:  [-1.51274368 -1.01074068  1.37232345  1.07421144  2.01828909  0.30852996
  0.46436283 -0.64358596 -1.69906167  1.74191539]
y_h:  [-1.01074068  1.37232345  1.07421144  2.01828909  0.30852996  0.46436283
 -0.64358596 -1.69906167  1.74191539 -2.00727577]

e:  [-1.43182647  0.7079561   0.42051982  1.15020388  0.15191228 -1.52350218
 -0.78313068 -2.3888026  -0.85077465 -1.75814383]
g:  [ 0.4210858   0.66436734  0.65369162  0.86808521  0.15661768  1.98786502
  0.13954472  0.68974093  2.59269005 -0.24913194]


In [305]:
def simulate_PCA(n=10, T=250, N=500, h_steps=1, R=10, nfac=2):
    # Initialize the MSE vector
    error_mat = np.zeros((R, 50 - h_steps - 1))

    for r in range(R):
        print("Run {}".format(r))
        variables = generate_data(n=n, T=T, h_steps=h_steps, N=N)

        # y_t contains the values of y at time t, and y_h contains the values of y at time t+h
        # X contains the values of X at time t
        y_h = variables['y_h']
        y_t = variables['y_t']
        X = variables['X']

        g = variables['g']

        # Loop for expanding window estimation
        # The initial window size is 200 observations
        # The window size is increased by 1 observation at each iteration
        # The window size is increased until it reaches 250 observations

        # Initialize the error vector
        error = np.zeros((50 - h_steps - 1, 1))

        for t in range(50 - h_steps - 1):
            # Split data into training and testing sets
            # Training set is the first 200 + t observations
            # Testing set is the last 50 - t observations
            idx_split = 200 + t

            y_train = y_t[:idx_split]
            y_test = y_t[idx_split]

            X_train = X[:idx_split]

            y_train_h = y_h[:idx_split]
            # Estimate the parameters of the model using sPCAest
            # The function sPCAest is defined in functions.py
            #factors, _ = sPCAest(target=y_train_h, X=X_train_h, nfac=nfac)
            pca = PCA(n_components=nfac)
            X_normalized = StandardScaler(with_mean=True, with_std=True).fit_transform(X_train)
            pca.fit(X_normalized)
            factors = pca.transform(X_normalized)
                
            reg = LinearRegression().fit(factors, y_train_h)

            # Predict y_{t+h} using the estimated parameters
            y_pred = reg.predict(factors[-1].reshape(1, -1))

            # Compute the error
            error[t] = y_pred - y_test
            
        print("MSE of run {} is {}".format(r, np.mean(error**2)))
        error_mat[r,:] = error.flatten()

    return error_mat

In [306]:
# Run the simulation
errors = simulate_PCA(n=50, T=250, N=500, h_steps=1, R=10, nfac=2)

Run 0
MSE of run 0 is 1.0769235762828442
Run 1
MSE of run 1 is 0.9949275978377182
Run 2
MSE of run 2 is 1.0507670075972764
Run 3
MSE of run 3 is 1.2235095263527922
Run 4
MSE of run 4 is 1.1199560712221495
Run 5
MSE of run 5 is 1.9306840972537669
Run 6
MSE of run 6 is 1.8204256328951633
Run 7
MSE of run 7 is 1.5634360837626557
Run 8
MSE of run 8 is 1.0656478588151697
Run 9
MSE of run 9 is 1.1336294218555947


In [307]:
mse_vec = (errors**2).mean(axis=1)
median_mse = np.median(mse_vec)
print("Median MSE is {}".format(median_mse))
print("MSE is {}".format(mse_vec))

Median MSE is 1.126792746538872
MSE is [1.07692358 0.9949276  1.05076701 1.22350953 1.11995607 1.9306841
 1.82042563 1.56343608 1.06564786 1.13362942]
