In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

from functions import winsor, sPCAest, corr_uniform
from scipy.ndimage import shift
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Weak factor case

In [None]:
def generate_data(n, T, N, h_steps=1, heteroskedastic=False, psi_max=None, rho=None):
    """Simulate a target series and a predictor panel from a two-factor model.

    Data generating process::

        y_{t+h} = g_t + e_{t+h}                      g_t, e_{t+h} ~ N(0, 1)
        X_{t,i} = g_t*phi_i + h_t*psi_i + u_{t,i}    h_t ~ N(0, 1)

    Parameters
    ----------
    n : int
        Number of predictors that load on the factors. ``n < N`` gives the
        weak factor model (only the first ``n`` entries of ``phi`` and ``psi``
        are nonzero, drawn from U(0, 1)); ``n == N`` gives the strong factor
        model.
    T : int
        Number of rows in every returned series.
    N : int
        Number of predictors (columns of ``X``).
    h_steps : int, default 1
        Forecast horizon. ``T + h_steps`` periods are simulated and the first
        ``h_steps`` are dropped so that the lagged series ``y_t`` is defined.
    heteroskedastic : bool, default False
        If True, the idiosyncratic variances of each period are multiplied by
        an independent U(0.5, 1.5) draw.
    psi_max : float, optional
        Upper bound of the uniform distribution of ``psi``. Required when
        ``n == N``.
    rho : float, optional
        Correlation parameter forwarded to ``corr_uniform``. Required when
        ``n == N``.

    Returns
    -------
    dict
        ``'y_h'`` : ``g + e``, row-aligned with ``X`` and ``g``.
        ``'y_t'`` : ``y_h`` lagged by ``h_steps`` rows.
        ``'X'``   : ``(T, N)`` predictor matrix.
        ``'g'``, ``'e'`` : the target factor and the target noise.

    Raises
    ------
    ValueError
        If ``n`` is not in ``[0, N]``.
    AssertionError
        If ``n == N`` and ``psi_max`` or ``rho`` is missing.
    """
    # Without this check n > N would leave variance_u undefined and fail
    # further down with an unhelpful name error.
    if not 0 <= n <= N:
        raise ValueError("n must satisfy 0 <= n <= N, got n={} and N={}".format(n, N))

    if n == N:
        # Strong factor model, require psi_max and rho
        assert psi_max is not None, "psi_max is required when n == N"
        assert rho is not None, "rho is required when n == N"

    # Simulate h_steps extra periods to account for the lag
    T_plus_h = T + h_steps

    # Factors and target noise (draw order: g, h, e)
    g = np.random.normal(0, 1, T_plus_h)
    h = np.random.normal(0, 1, T_plus_h)
    e = np.random.normal(0, 1, T_plus_h)

    # y_{t+h}, stored on the same row as g_t
    y_h = g + e

    # y_t is y_h shifted down by h_steps rows; the first h_steps rows stay 0
    # and are dropped below.
    y_t = np.zeros(T_plus_h)
    y_t[h_steps:] = y_h[:T]

    phi = np.zeros(N)
    psi = np.zeros(N)

    if n < N:
        # Weak factor model: only the first n loadings are nonzero
        phi[:n] = np.random.uniform(0, 1, n)
        psi[:n] = np.random.uniform(0, 1, n)

        # Idiosyncratic error variances
        variance_u = np.random.uniform(0, 1, N)
    else:
        # Strong factor model: psi is U(0, psi_max)
        psi = np.random.uniform(0, psi_max, N)

        # Draw correlated phi and idiosyncratic variances.
        # NOTE(review): rho is inflated by 10% before being passed on; the
        # reason is not visible in this file - confirm against corr_uniform.
        phi, variance_u = corr_uniform(rho=rho*1.1, size=N)

    if heteroskedastic:
        u = np.zeros((T, N))
        for t in range(T):
            scale = np.random.uniform(0.5, 1.5)
            # np.random.normal expects a standard deviation, so take the
            # square root of the (rescaled) variance. This keeps the branch
            # consistent with the homoskedastic one, where variance_u is the
            # diagonal of the covariance matrix.
            u[t, :] = np.random.normal(0, np.sqrt(variance_u * scale), N)
    else:
        # Diagonal covariance matrix: independent errors with variance_u
        sigma_u = np.diag(variance_u)
        u = np.random.multivariate_normal(np.zeros(N), sigma_u, T)

    # Drop first h rows
    y_h = y_h[h_steps:]
    y_t = y_t[h_steps:]
    e = e[h_steps:]
    g = g[h_steps:]
    h = h[h_steps:]

    # X_t,i = g_t*phi_i + h_t*psi_i + u_t,i for all t, i at once
    X = np.outer(g, phi) + np.outer(h, psi) + u

    results = {'y_t': y_t,
               'y_h': y_h,
               'X': X,
               'g': g,
               'e': e,
               }

    return results

# Smoke test: simulate one weak-factor sample (n < N) and inspect the output
results = generate_data(n=10, T=250, N=500, h_steps=1)
y_t, y_h, X, e, g = (results[key] for key in ('y_t', 'y_h', 'X', 'e', 'g'))

# Shapes of the two target series and the predictor matrix, one per line
print(y_t.shape, y_h.shape, X.shape, sep='\n')

# First ten values of the lagged and the aligned target
print('y_t: ', y_t[:10])
print('y_h: ', y_h[:10])
print()
# First ten values of the noise and factor that make up y_h
print('e: ', e[:10])
print('g: ', g[:10])


In [None]:
def simulate_PCA(n=10, T=250, N=500, h_steps=1, R=10, nfac=2, method="sPCA", heteroskedastic=False, psi_max=None, rho=None, train_size=200):
    """Monte Carlo evaluation of factor-based forecasts with an expanding window.

    For each of the ``R`` replications a fresh sample is drawn with
    ``generate_data``. For every forecast origin, factors are extracted from
    the training rows of ``X``, the target is regressed on them, and the
    forecast made from the last training row is compared with its realised
    value.

    Parameters
    ----------
    n, T, N, h_steps, heteroskedastic, psi_max, rho
        Forwarded unchanged to ``generate_data``.
    R : int, default 10
        Number of Monte Carlo replications.
    nfac : int, default 2
        Number of factors to extract.
    method : {"sPCA", "PCA"}, default "sPCA"
        ``"sPCA"`` extracts factors with ``sPCAest``; ``"PCA"`` runs
        scikit-learn ``PCA`` on the standardised training predictors.
    train_size : int, default 200
        Number of rows in the first training window. The window grows by one
        row per forecast origin.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(R, T - train_size - h_steps - 1)`` holding the
        forecast errors (prediction minus realisation). With the defaults
        ``T=250`` and ``train_size=200`` this is ``(R, 50 - h_steps - 1)``.

    Raises
    ------
    ValueError
        If ``method`` is unknown, ``h_steps < 1``, ``train_size <= h_steps``,
        or the settings leave no forecast origin to evaluate.
    """
    if method not in ("sPCA", "PCA"):
        # Otherwise `factors` would be undefined (or stale from the previous
        # iteration) when the regression is fitted.
        raise ValueError("method must be 'sPCA' or 'PCA', got {!r}".format(method))
    if h_steps < 1:
        # The regression below trims the last h_steps rows with [:-h_steps],
        # which would be empty for h_steps == 0.
        raise ValueError("h_steps must be at least 1, got {}".format(h_steps))
    if train_size <= h_steps:
        raise ValueError("train_size must exceed h_steps")

    # Number of forecast origins; equals 50 - h_steps - 1 for T=250, train_size=200
    n_periods = T - train_size - h_steps - 1
    if n_periods < 1:
        raise ValueError(
            "No forecast origins: T - train_size - h_steps - 1 = {}".format(n_periods)
        )

    # One row of forecast errors per replication
    error_mat = np.zeros((R, n_periods))

    for r in range(R):
        variables = generate_data(n=n, T=T, h_steps=h_steps, N=N, heteroskedastic=heteroskedastic, psi_max=psi_max, rho=rho)

        # y_h[j] = g[j] + e[j] shares row j with X[j], the predictors that
        # load on g[j].
        y_h = variables['y_h']
        X = variables['X']

        # Expanding window: the training sample starts with train_size rows
        # and grows by one row at each forecast origin.
        for t in range(n_periods):
            if t % 10 == 0:
                print("Run {} Period {}".format(r, t))

            idx_split = train_size + t

            X_train = X[:idx_split]
            y_train = y_h[:idx_split]

            # Realised value of the forecast made from the last training row.
            # For h_steps == 1 this equals y_t[idx_split]; for larger horizons
            # y_t[idx_split] would be y_h[idx_split - h_steps], which is not
            # the observation the forecast targets.
            y_test = y_h[idx_split - 1]

            if method == "sPCA":
                # sPCAest is defined in functions.py.
                # NOTE(review): it receives the full y_train, including the
                # last h_steps rows that are withheld from the regression
                # below; whether this is look-ahead depends on sPCAest's
                # internals - confirm.
                factors, _ = sPCAest(target=y_train, X=X_train, nfac=nfac)
            else:
                pca = PCA(n_components=nfac)
                X_normalized = StandardScaler(with_mean=True, with_std=True).fit_transform(X_train)
                pca.fit(X_normalized)
                factors = pca.transform(X_normalized)

            # Fit without the last h_steps rows, whose targets are not yet
            # observed at the forecast origin.
            reg = LinearRegression().fit(factors[:-h_steps, :], y_train[:-h_steps])

            # Forecast from the most recent factor observation
            y_pred = reg.predict(factors[-1].reshape(1, -1))

            error_mat[r, t] = np.ravel(y_pred)[0] - y_test

        print("MSE of run {} is {}".format(r, np.mean(error_mat[r] ** 2)))

    return error_mat

In [42]:
# Execute the Monte Carlo experiment (n == N, i.e. the strong factor model)
errors = simulate_PCA(
    n=100,
    T=250,
    N=100,
    h_steps=1,
    R=100,
    nfac=2,
    method="sPCA",
    heteroskedastic=False,
    psi_max=0.5,
    rho=0.75,
)

Run 0 Period 0
Run 0 Period 10
Run 0 Period 20
Run 0 Period 30
Run 0 Period 40
MSE of run 0 is 0.6709633413573367
Run 1 Period 0
Run 1 Period 10
Run 1 Period 20
Run 1 Period 30
Run 1 Period 40
MSE of run 1 is 1.046301649182494
Run 2 Period 0
Run 2 Period 10
Run 2 Period 20
Run 2 Period 30
Run 2 Period 40
MSE of run 2 is 1.1592177103875396
Run 3 Period 0
Run 3 Period 10
Run 3 Period 20
Run 3 Period 30
Run 3 Period 40
MSE of run 3 is 1.3127338687787418
Run 4 Period 0
Run 4 Period 10
Run 4 Period 20
Run 4 Period 30
Run 4 Period 40
MSE of run 4 is 0.8809837129310637
Run 5 Period 0
Run 5 Period 10
Run 5 Period 20
Run 5 Period 30
Run 5 Period 40
MSE of run 5 is 0.7979332783669587
Run 6 Period 0
Run 6 Period 10
Run 6 Period 20
Run 6 Period 30
Run 6 Period 40
MSE of run 6 is 1.0861859030661527
Run 7 Period 0
Run 7 Period 10
Run 7 Period 20
Run 7 Period 30
Run 7 Period 40
MSE of run 7 is 1.1312210397504288
Run 8 Period 0
Run 8 Period 10
Run 8 Period 20
Run 8 Period 30
Run 8 Period 40
MSE of run

In [43]:
# Per-replication MSE: average the squared forecast errors over the periods
mse_vec = np.mean(np.square(errors), axis=1)

# Summaries across replications
median_mse = np.median(mse_vec)
mean_mse = np.mean(mse_vec)

print(f"Median MSE is {median_mse}")
print(f"Mean MSE is {mean_mse}")
print(f"MSE is {mse_vec}")

Median MSE is 1.0431857875254658
Mean MSE is 1.0459105254651706
MSE is [0.67096334 1.04630165 1.15921771 1.31273387 0.88098371 0.79793328
 1.0861859  1.13122104 0.83025976 1.03880938 0.93811049 0.76833832
 0.90734023 1.07392533 1.09931911 0.77118362 0.92444606 1.25014224
 0.78483536 1.01929489 0.81226603 0.64809377 1.10860066 1.3690869
 1.11877923 1.20546851 0.96036093 1.21386409 1.1986419  1.03599556
 1.2339176  1.30786021 1.19835291 0.9753385  1.05507284 0.84020745
 1.65637381 1.33553391 0.82187501 0.91252651 1.53501439 0.66304062
 1.19246864 0.9276232  1.00022335 1.20131229 0.82859251 1.09640533
 1.15426597 0.99380607 1.16335895 0.86993052 1.04646871 0.83447452
 0.97051141 1.01056358 1.21949755 0.90248389 1.2820901  0.72748661
 0.80290392 0.91519401 0.84168758 0.92026233 0.82576136 1.18876983
 1.04117417 1.013338   0.9125243  0.87921363 1.1885179  0.9302438
 1.27435126 1.50434979 0.82762053 1.19982013 0.89996094 1.07972799
 1.0086908  0.96058533 1.12449925 0.85570976 1.09774469 1.24