In [2]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

from src.helpers.functions import winsor, corr_uniform, sPCAest
from scipy.ndimage import shift
import sklearn
from tqdm import tqdm
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA, KernelPCA, SparsePCA, FastICA
from sklearn.preprocessing import StandardScaler




# Weak factor case

In [3]:
def generate_data(n, T, N, h_steps=1, heteroskedastic=False, psi_max=None, rho=None):
    """Simulate one sample from the two-factor forecasting model.

    Model:
        y_{t+h} = g_t + e_{t+h},                      g_t, e_t ~ N(0, 1)
        X_{t,i} = g_t * phi_i + h_t * psi_i + u_{t,i},  h_t ~ N(0, 1)

    Parameters
    ----------
    n : int
        Number of predictors with non-zero loadings. ``n < N`` selects the
        weak factor design (only the first ``n`` entries of ``phi`` and
        ``psi`` are non-zero); ``n == N`` selects the strong factor design.
    T : int
        Number of observations returned.
    N : int
        Number of predictors (columns of ``X``).
    h_steps : int, default 1
        Forecast horizon h; ``T + h_steps`` periods are simulated and the
        first ``h_steps`` are dropped.
    heteroskedastic : bool, default False
        If True, the idiosyncratic variance of every row of ``u`` is
        multiplied by an independent U(0.5, 1.5) draw.
    psi_max : float, optional
        Upper bound of the uniform distribution of ``psi``; required when
        ``n == N``.
    rho : float, optional
        Passed (scaled by 1.1) to ``corr_uniform``; required when ``n == N``.

    Returns
    -------
    dict
        ``'y_t'`` (T,), ``'y_h'`` (T,), ``'X'`` (T, N), ``'g'`` (T,) and
        ``'e'`` (T,). ``y_h[j] == g[j] + e[j]`` and ``y_t[j] == y_h[j - h_steps]``,
        i.e. ``y_h`` is ``y_t`` led by ``h_steps`` periods.

    Raises
    ------
    ValueError
        If ``n`` is not in ``[0, N]``.
    AssertionError
        If ``n == N`` and ``psi_max`` or ``rho`` is missing.
    """
    if not 0 <= n <= N:
        # Previously n > N fell through both branches and crashed later with a
        # NameError on variance_u.
        raise ValueError("n must satisfy 0 <= n <= N, got n={} and N={}".format(n, N))

    strong_factor = n == N
    if strong_factor:
        # Strong factor model, require psi_max and rho
        assert psi_max is not None, "psi_max is required when n == N"
        assert rho is not None, "rho is required when n == N"

    # Simulate h extra periods so that T observations remain after the lag
    T_plus_h = T + h_steps

    # Latent factors and forecast error
    g = np.random.normal(0, 1, T_plus_h)
    h = np.random.normal(0, 1, T_plus_h)
    e = np.random.normal(0, 1, T_plus_h)

    # y_h is the h-step-ahead target; y_t is the same series lagged by h
    y_h = g + e
    y_t = np.zeros(T_plus_h)
    y_t[h_steps:] = y_h[:T]

    phi = np.zeros(N)
    psi = np.zeros(N)

    if strong_factor:
        # psi ~ U(0, psi_max)
        psi = np.random.uniform(0, psi_max, N)

        # Draw correlated phi and idiosyncratic variances.
        # NOTE(review): corr_uniform is a project helper, assumed to return
        # two length-N arrays; the 1.1 factor on rho is not explained in this
        # file - confirm.
        phi, variance_u = corr_uniform(rho=rho*1.1, size=N)
    else:
        # Only the first n predictors load on the factors
        phi[:n] = np.random.uniform(0, 1, n)
        psi[:n] = np.random.uniform(0, 1, n)

        # Idiosyncratic error variances
        variance_u = np.random.uniform(0, 1, N)

    variance_u = np.asarray(variance_u, dtype=float)

    if heteroskedastic:
        # One variance multiplier per time period. np.random.normal expects a
        # standard deviation, so take the square root of the scaled variance
        # (the homoskedastic branch below also treats variance_u as a variance).
        scale = np.random.uniform(0.5, 1.5, size=(T, 1))
        u = np.random.normal(0.0, np.sqrt(variance_u[np.newaxis, :] * scale))
    else:
        u = np.random.multivariate_normal(mean=np.zeros(N), cov=np.diag(variance_u), size=T)

    # Drop first h rows
    y_h = y_h[h_steps:]
    y_t = y_t[h_steps:]
    e = e[h_steps:]
    g = g[h_steps:]
    h = h[h_steps:]

    X = g[:, np.newaxis]*phi[np.newaxis, :] + h[:, np.newaxis]*psi[np.newaxis, :] + u

    results = {'y_t': y_t,
               'y_h': y_h,
               'X': X,
               'g': g,
               'e': e,
               }

    return results

# Smoke test: simulate one weak-factor sample and eyeball shapes and alignment
results = generate_data(10, T=250, N=500, h_steps=1)
y_t, y_h, X, e, g = (results[key] for key in ('y_t', 'y_h', 'X', 'e', 'g'))

# Shapes, one per line
print(*(arr.shape for arr in (y_t, y_h, X)), sep='\n')

# y_h should be y_t shifted forward by h_steps
print('y_t: ', y_t[:10])
print('y_h: ', y_h[:10], end='\n\n')

# y_h should equal g + e element-wise
print('e: ', e[:10])
print('g: ', g[:10])


(250,)
(250,)
(250, 500)
y_t:  [ 2.04913679 -1.64999425 -1.30016159  0.83831353 -4.52415919  1.20956829
 -0.8243528   0.77193026 -2.38909931  0.33492328]
y_h:  [-1.64999425 -1.30016159  0.83831353 -4.52415919  1.20956829 -0.8243528
  0.77193026 -2.38909931  0.33492328  1.20228037]

e:  [-1.1674389  -0.50939583 -0.02538958 -1.4283239  -0.57223265 -1.14476396
  0.52698544 -1.13888465 -0.46207751  1.87028361]
g:  [-0.48255535 -0.79076575  0.86370311 -3.09583529  1.78180094  0.32041116
  0.24494482 -1.25021467  0.79700079 -0.66800324]


In [4]:
def predict_pca(X, y_h, y_t, h_steps=1, nfac=2, method="sPCA", start_test=200, test_window=50):
    """Expanding-window h-step-ahead forecasts from (scaled) PCA factors.

    For each step ``t`` the factors are estimated on ``X[:start_test + t]``,
    ``y`` is regressed on the factors lagged by ``h_steps``, and the last
    factor row is used to forecast ``y`` ``h_steps`` periods after the end of
    the training window.

    Parameters
    ----------
    X : ndarray
        Predictor matrix, one row per time period.
    y_h : ndarray
        Unused; kept so existing callers do not break.
    y_t : ndarray
        Target series aligned with the rows of ``X``.
    h_steps : int, default 1
        Forecast horizon.
    nfac : int, default 2
        Number of factors to extract.
    method : {"sPCA", "PCA"}, default "sPCA"
        ``"sPCA"`` calls the project helper ``sPCAest``; ``"PCA"`` applies
        scikit-learn ``PCA`` to the standardised training predictors.
    start_test : int, default 200
        Size of the first training window.
    test_window : int, default 50
        Length of the hold-out span; ``test_window - h_steps - 1`` forecasts
        are produced (this reproduces the previously hard-coded 50).

    Returns
    -------
    error_mat : ndarray, shape (1, test_window - h_steps - 1)
        Forecast errors (prediction minus realised value).
    predicted : ndarray, shape (1, test_window - h_steps - 1)
        The forecasts themselves.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method not in ("sPCA", "PCA"):
        # Previously an unknown method left `factors` unbound.
        raise ValueError("method must be 'sPCA' or 'PCA', got {!r}".format(method))

    n_forecasts = test_window - h_steps - 1

    pca = PCA(n_components=nfac)
    reg = LinearRegression()
    normalize = StandardScaler(with_mean=True, with_std=True)

    predicted = np.zeros((1, n_forecasts))
    error_mat = np.zeros((1, n_forecasts))

    for t in range(n_forecasts):
        # Training set is the first start_test + t observations
        idx_split = start_test + t

        X_train = X[:idx_split]
        y_train = y_t[:idx_split]

        # The forecast below is made from the factor at time idx_split - 1,
        # so it targets y at time idx_split - 1 + h_steps. The original code
        # compared against y_t[idx_split], which is only correct for h_steps == 1.
        y_test = y_t[idx_split + h_steps - 1]

        if method == "sPCA":
            # sPCAest is defined in src/helpers/functions.py.
            # NOTE(review): [0, 100] is presumably a pair of winsorisation
            # percentiles - confirm against the helper.
            factors, _ = sPCAest(y_train, X_train, nfac, [0, 100], h_steps)
        else:
            X_normalized = normalize.fit_transform(X_train)
            factors = pca.fit_transform(X_normalized)

        # Regress y_{s+h} on the factors at time s
        reg.fit(factors[:-h_steps, :], y_train[h_steps:])

        # Predict y_{t+h} from the most recent factor row; .item() extracts
        # the scalar explicitly instead of relying on implicit array-to-scalar
        # conversion during assignment.
        y_pred = reg.predict(factors[-1].reshape(1, -1)).item()

        predicted[0, t] = y_pred
        error_mat[0, t] = y_pred - y_test

    return error_mat, predicted


def simulate_PCA(n=10, T=250, N=500, h_steps=1, R=10, nfac=2, method="sPCA", heteroskedastic=False, psi_max=None, rho=None):
    """Run R Monte Carlo replications of the expanding-window forecast exercise.

    Each replication draws a fresh sample with ``generate_data`` and passes it
    to ``predict_pca`` with an initial training window of 200 observations.

    Returns
    -------
    error_mat : ndarray, shape (R, 50 - h_steps - 1)
        Forecast errors, one row per replication.
    predicted : ndarray, shape (R, 50 - h_steps - 1)
        Forecasts, one row per replication.
    """
    n_forecasts = 50 - h_steps - 1
    first_train_size = 200

    forecast_store = np.zeros((R, n_forecasts))
    error_store = np.zeros((R, n_forecasts))

    for rep in tqdm(range(R)):
        sample = generate_data(
            n=n,
            T=T,
            h_steps=h_steps,
            N=N,
            heteroskedastic=heteroskedastic,
            psi_max=psi_max,
            rho=rho,
        )

        # 'y_t' is y at time t, 'y_h' is y at time t+h, 'X' holds the
        # predictors at time t. The window starts at 200 observations and
        # grows by one observation per forecast.
        rep_errors, rep_forecasts = predict_pca(
            sample['X'],
            sample['y_h'],
            sample['y_t'],
            h_steps,
            nfac,
            method,
            first_train_size,
        )

        error_store[rep, :] = rep_errors
        forecast_store[rep, :] = rep_forecasts

    return error_store, forecast_store

In [5]:
import warnings
# NOTE(review): this silences every warning for the rest of the kernel session,
# not only the ones raised inside the simulation.
warnings.filterwarnings('ignore')

# Run the simulation over the grid of factor counts, methods, noise designs and n

for nfac in [1, 2, 3]:
    for method in ["sPCA"]:
        for heterosked in [True, False]:
            for n in [10, 20, 30, 40, 50]:
                print("nfac: {}, method: {}, heterosked: {}, n: {}".format(nfac, method, heterosked, n))
                # R=10 replications per configuration; the recorded output shows
                # roughly 17 s per replication for sPCA.
                errors, predictions = simulate_PCA(n=n, T=250, N=500, h_steps=1, R=10, nfac=nfac, method=method, heteroskedastic=heterosked)

                # Save errors in npy file
                #np.save("resources/results/sim/newerrors_nfac_{}_method_{}_heterosked_{}_n_{}".format(nfac, method, heterosked, n), errors)
                # Per-replication MSE (mean over the forecast steps), then its median
                mse_vec = (errors**2).mean(axis=1)
                print(mse_vec)
                print("Median MSE: {}".format(np.median(mse_vec)))

# `errors`, `predictions` and `mse_vec` are overwritten on every iteration, so
# later cells only see the last configuration that finished.
#errors, predictions = simulate_PCA(n=50, T=250, N=500, h_steps=1, R=100, nfac=2, method="PCA", heteroskedastic=False)

nfac: 1, method: sPCA, heterosked: True, n: 10


100%|██████████| 10/10 [02:53<00:00, 17.33s/it]


[1.19301935 1.91638123 1.60194477 1.40395473 1.03002431 1.88759607
 0.89039235 1.29198268 1.59367375 1.51065004]
Median MSE: 1.4573023865088728
nfac: 1, method: sPCA, heterosked: True, n: 20


 20%|██        | 2/10 [00:39<02:39, 19.99s/it]


KeyboardInterrupt: 

In [15]:
# Per-replication MSE: average the squared forecast errors across forecast steps
mse_vec = np.mean(np.square(errors), axis=1)

# Summary statistics across replications
median_mse, mean_mse = np.median(mse_vec), np.mean(mse_vec)

print("Median MSE is {}".format(median_mse))
print("Mean MSE is {}".format(mean_mse))
print("MSE is {}".format(mse_vec))

# Raw error matrix (one row per replication)
print(errors)

Median MSE is 1.2032784403170285
Mean MSE is 1.3454188785284196
MSE is [1.1533483  1.67962989 1.20327844]
[[-1.37698031e+00 -1.51825559e+00 -2.45048694e+00 -7.93658656e-02
   5.95462830e-01 -2.33476740e+00 -7.59228655e-01 -4.68956948e-01
   9.11057507e-01  5.46488803e-01  1.80725337e-01  1.29629386e+00
   2.11389521e-01  3.95513080e-01  2.95997324e-01 -1.49515763e+00
   1.76824381e+00  1.69391660e+00  1.10807763e+00  1.11321922e+00
   6.67677012e-02 -6.46077988e-01  8.01870294e-02  5.23966656e-01
   1.35655611e+00  6.63764057e-01 -5.32275170e-01  9.97696500e-02
  -5.45598588e-01  1.41257072e+00  2.94573106e-02 -3.51512564e-01
  -1.18330142e+00 -1.94427611e-01 -9.15249179e-01  5.55706284e-01
  -1.33517940e+00  8.87010748e-01 -1.37463230e+00 -2.17419647e+00
  -1.25740135e+00 -1.31318055e+00 -6.14154079e-01  2.22614888e-01
  -1.46933551e+00 -1.68493354e-01  1.13868634e+00  1.89460041e-01]
 [-1.43467485e+00 -4.55907479e-01  7.61920907e-01 -6.70436295e-01
   4.56099269e-01  9.78955865e-01 -

## Plotting

In [None]:
# Plot the predicted values of the first run of the simulation
# (`predictions` comes from the most recent simulate_PCA call in an earlier cell)
plt.plot((predictions[0,:]))
plt.title("Predicted values of the first run of the simulation")
# Include tick at every other integer
plt.xticks(np.arange(0, 50, 2))
plt.show()

# Plot the histogram of mean squared error over the different runs
# (`mse_vec` holds one MSE per replication, computed in an earlier cell)
plt.hist(mse_vec, bins=40)
plt.title("Histogram of mean squared error over the different runs")
plt.show()
