### Alternative validation schemes

In [1]:
import os
# Display the current working directory: the Excel data file is read with a
# relative path, and the helper modules are expected to live in this directory.
os.getcwd()
# os.chdir(path)    # or you can set your working dir.

'/Users/xingfuxu/PycharmProjects/EquityPremiumPredictionML-Jupyter'

In [2]:
# Your working dir should include the "NN_models.py", "Perform_CW_test.py" and "Perform_PT_test.py" files.
from Perform_CW_test import CW_test
from Perform_PT_test import PT_test
from NN_models import Net2, Net4

In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import ParameterGrid
import torch
from skorch import NeuralNetRegressor
from tqdm import tqdm
#
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score

In [4]:
# Seed both random number generators used by this notebook (PyTorch and the
# global NumPy generator) so that stochastic steps are repeatable.
torch.manual_seed(1)
np.random.seed(1)

# Read the predictor sheet. The path is handed straight to pandas so that
# pandas opens *and closes* the workbook itself; wrapping it in a bare
# open(..., 'rb') left the file handle open for the lifetime of the kernel.
predictor_df = pd.read_excel('ml_equity_premium_data.xlsx', sheet_name='result_predictor')
predictor_df.head()

Unnamed: 0,month,log_equity_premium,equity_premium,DP,DY,EP,SVAR,BM,NTIS,TBL,...,MA_2_9,MA_2_12,MA_3_9,MA_3_12,MOM_1,MOM_2,MOM_3,MOM_6,MOM_9,MOM_12
0,192701,-0.00571,-0.00571,-2.942374,-2.963349,-2.374773,0.00047,0.44371,0.05082,3.23,...,1,1,1,1,0,0,1,1,1,1
1,192702,0.042017,0.04302,-2.979535,-2.932946,-2.430353,0.00029,0.4285,0.05167,3.29,...,1,1,1,1,1,1,1,1,1,1
2,192703,0.004697,0.00472,-2.976535,-2.970053,-2.445079,0.00092,0.46977,0.04636,3.2,...,1,1,1,1,1,1,1,1,1,1
3,192704,0.00994,0.01002,-2.984225,-2.967143,-2.471309,0.0006,0.45675,0.05051,3.39,...,1,1,1,1,1,1,1,1,1,1
4,192705,0.057987,0.05985,-3.025963,-2.975058,-2.531446,0.00039,0.43478,0.05528,3.33,...,1,1,1,1,1,1,1,1,1,1


In [5]:
# Remove irrelevant columns.
predictor0 = predictor_df.drop(columns=['month', 'equity_premium'])

# Align the target with the predictors: column 0 holds the log equity premium
# of the *next* row, the remaining columns hold the current row's values of
# every column after the first one of predictor0 (the last row is dropped
# because it has no next-month target).
log_premium = predictor0['log_equity_premium'].to_numpy()
next_month_premium = log_premium[1:].reshape(-1, 1)
lagged_predictors = predictor0.iloc[:-1, 1:].to_numpy()
predictor = np.hstack([next_month_premium, lagged_predictors])

# N: number of rows; n_cols: number of columns, including the log equity premium.
N, n_cols = predictor.shape

# Actual one-month-ahead log equity premium, kept as an (N, 1) column.
actual = predictor[:, [0]]

# Historical-average benchmark: the forecast in row i is the mean of the log
# equity premium over rows 0..i (running sum divided by the running count).
running_sum = np.cumsum(log_premium[:N])
running_count = np.arange(1, N + 1)
y_pred_HA = (running_sum / running_count).reshape(-1, 1)

In [6]:
## Out-of-sample: 1957:01-2020:12
# Index label of the row whose month is 195701; every row from here onwards
# belongs to the out-of-sample evaluation window.
# NOTE(review): `actual` stores the one-month-ahead premium, so the first
# evaluated target appears to be the month after 195701 - confirm that this
# matches the window stated above.
in_out_1957 = predictor_df.index[predictor_df['month'].eq(195701)][0]
actual_1957 = actual[in_out_1957:]
y_pred_HA_1957 = y_pred_HA[in_out_1957:]
# Mean squared forecast error of the historical-average benchmark.
MSFE_HA_1957 = mean_squared_error(y_pred_HA_1957, actual_1957)

# Machine Learning methods used in GKX (2020): one forecast list per model,
# filled month by month by the forecasting loop.
y_pred_PLS_1957 = []
y_pred_PCR_1957 = []
y_pred_LASSO_1957 = []
y_pred_ENet_1957 = []
y_pred_RF_1957 = []
y_pred_NN2_1957 = []
y_pred_NN4_1957 = []

## Other commonly used machine learning method
y_pred_Ridge_1957 = []

In [7]:
# Controls in which months the models are re-tuned during the out-of-sample period:
# the forecasting loop counts months starting from this value and re-tunes
# whenever that counter satisfies `counter % 12 == 1`.
month_index = 1  # We update our models annually, meaning we refresh them in months 1, 13, 25, ...

In [8]:
# Expanding-window out-of-sample forecasting.
#
# For every out-of-sample row t the models are trained on all rows before t and
# asked for a one-step forecast from row t's predictors. Hyper-parameter tuning
# (5-fold grid search on MSE) and refitting are expensive, so they only happen
# on steps where the month counter satisfies `counter % 12 == 1`; in between,
# the most recently fitted models are reused.
#
# The month counter is a *local* copy that starts at `month_index`; the global
# is no longer mutated, so re-running this cell gives the same refit schedule
# instead of silently skipping the initial fit. The forecast lists are appended
# to, so re-run the cell that initialises them before re-running this one.
for step, t in enumerate(tqdm(range(in_out_1957, N)), start=month_index):
    # Training sample: every row strictly before t.
    X_train_all = predictor[:t, 1:n_cols]
    y_train_all = predictor[:t, 0]

    # Predictors for the forecast issued at row t (kept 2-D: a single row).
    X_new = predictor[[t], 1:n_cols]
    X_new_tensor = torch.tensor(X_new, dtype=torch.float)

    # Always fit on the very first step so that every model exists before its
    # first prediction, whatever starting value `month_index` was given.
    if step % 12 == 1 or t == in_out_1957:

        # PLS: tune the number of components, then refit with the best value.
        PLS = PLSRegression()
        PLS_param = {'n_components': [1, 2, 3, 4, 5, 6, 7, 8]}
        PLS_multi = GridSearchCV(estimator=PLS, param_grid=PLS_param, scoring='neg_mean_squared_error', cv=5)
        PLS_multi.fit(X_train_all, y_train_all)
        PLS_best_param = PLS_multi.best_params_
        PLS_model = PLSRegression(**PLS_best_param)
        PLS_model.fit(X_train_all, y_train_all)

        # PCR: principal components followed by OLS on the first k components.
        # k is chosen by 5-fold CV on components computed from the whole
        # training sample.
        PCR_param = [1, 2, 3, 4, 5, 6, 7, 8]
        PCR_result = {}
        pca = PCA()
        pca.fit(X_train_all)
        X_train_all_comps = pca.transform(X_train_all)
        cv5 = KFold(n_splits=5)

        for k in PCR_param:
            forecast = LinearRegression()
            mse = -1 * cross_val_score(forecast, X_train_all_comps[:, :k], y_train_all, cv=cv5,
                                       scoring='neg_mean_squared_error').mean()
            # Integer keys: no need to round-trip through str and eval().
            PCR_result[k] = mse

        PCR_best_param = min(PCR_result, key=PCR_result.get)
        PCR_model = PCA(n_components=PCR_best_param)
        PCR_model.fit(X_train_all)
        PCR_comps = PCR_model.transform(X_train_all)
        PCR_forecast = LinearRegression()
        PCR_forecast.fit(PCR_comps, y_train_all)

        # LASSO: penalty grid 10**-4 ... 10**1 in steps of 0.2 in the exponent
        # (the +0.001 makes the upper end inclusive).
        LASSO = Lasso()
        LASSO_param = {'alpha': list(10 ** np.arange(-4, 1 + 0.001, 0.2))}
        LASSO_multi = GridSearchCV(estimator=LASSO, param_grid=LASSO_param, scoring='neg_mean_squared_error', cv=5)
        LASSO_multi.fit(X_train_all, y_train_all)
        LASSO_best_param = LASSO_multi.best_params_
        LASSO_model = Lasso(**LASSO_best_param)
        LASSO_model.fit(X_train_all, y_train_all)

        # ENet: same penalty grid as LASSO, l1_ratio in {0.2, 0.5, 0.8}.
        ENet = ElasticNet()
        ENet_param = {'alpha': list(10 ** np.arange(-4, 1 + 0.001, 0.2)),
                      'l1_ratio': list(np.arange(0.2, 1, 0.3))}
        ENet_multi = GridSearchCV(estimator=ENet, param_grid=ENet_param, scoring='neg_mean_squared_error', cv=5)
        ENet_multi.fit(X_train_all, y_train_all)
        ENet_best_param = ENet_multi.best_params_
        ENet_model = ElasticNet(**ENet_best_param)
        ENet_model.fit(X_train_all, y_train_all)

        # RF
        RF = RandomForestRegressor()
        RF_param = {'n_estimators': [10, 50, 100, 150, 200],
                    'max_depth': [2, 3, 4],
                    'min_samples_leaf': [1, 3, 5]}
        RF_multi = GridSearchCV(estimator=RF, param_grid=RF_param, scoring='neg_mean_squared_error', cv=5)
        RF_multi.fit(X_train_all, y_train_all)
        RF_best_param = RF_multi.best_params_
        RF_model = RandomForestRegressor(**RF_best_param)
        RF_model.fit(X_train_all, y_train_all)

        # Neural Network Models: NN2 & NN4 (trained on float tensors; the
        # target is reshaped to a single column).
        X_train_all_tensor = torch.tensor(X_train_all, dtype=torch.float)
        y_train_all_tensor = torch.tensor(y_train_all.reshape(-1, 1), dtype=torch.float)

        # NN2
        NN2_architecture = {"module__n_feature": X_train_all_tensor.shape[1],
                            "module__n_hidden1": 32, "module__n_hidden2": 16,
                            "module__n_output": 1}
        NN2 = NeuralNetRegressor(Net2, verbose=0, max_epochs=200,
                                 optimizer=torch.optim.SGD, **NN2_architecture)
        NN2_param = {'module__dropout': [0.2, 0.4, 0.6, 0.8],
                     'lr': [0.001, 0.01],
                     'optimizer__weight_decay': [0.1, 0.01, 0.001]}
        NN2_multi = GridSearchCV(estimator=NN2, param_grid=NN2_param, scoring='neg_mean_squared_error', cv=5)
        NN2_multi.fit(X_train_all_tensor, y_train_all_tensor)
        NN2_best_param = NN2_multi.best_params_
        NN2_model = NeuralNetRegressor(Net2, verbose=0, max_epochs=200, optimizer=torch.optim.SGD,
                                       **NN2_architecture, **NN2_best_param)
        NN2_model.fit(X_train_all_tensor, y_train_all_tensor)

        # NN4
        NN4_architecture = {"module__n_feature": X_train_all_tensor.shape[1],
                            "module__n_hidden1": 32, "module__n_hidden2": 16,
                            "module__n_hidden3": 8, "module__n_hidden4": 4,
                            "module__n_output": 1}
        NN4 = NeuralNetRegressor(Net4, verbose=0, max_epochs=200, optimizer=torch.optim.SGD, **NN4_architecture)
        NN4_param = {'module__dropout': [0.2, 0.4, 0.6, 0.8],
                     'lr': [0.001, 0.01],
                     'optimizer__weight_decay': [0.1, 0.01, 0.001]}
        NN4_multi = GridSearchCV(estimator=NN4, param_grid=NN4_param, scoring='neg_mean_squared_error', cv=5)
        NN4_multi.fit(X_train_all_tensor, y_train_all_tensor)
        NN4_best_param = NN4_multi.best_params_
        NN4_model = NeuralNetRegressor(Net4, verbose=0, max_epochs=200, optimizer=torch.optim.SGD,
                                       **NN4_architecture, **NN4_best_param)
        NN4_model.fit(X_train_all_tensor, y_train_all_tensor)

        ## Other commonly used ML methods
        # Ridge: penalty grid 10**0 ... 10**20.
        RIDGE = Ridge()
        Ridge_param = {'alpha': list(10 ** np.arange(0, 20 + 0.001, 1))}
        RIDGE_multi = GridSearchCV(estimator=RIDGE, param_grid=Ridge_param, scoring='neg_mean_squared_error', cv=5)
        RIDGE_multi.fit(X_train_all, y_train_all)
        Ridge_best_param = RIDGE_multi.best_params_
        Ridge_model = Ridge(**Ridge_best_param)
        Ridge_model.fit(X_train_all, y_train_all)

    # One-step forecasts from the most recently fitted models. This single code
    # path serves both refit months and the months in between.
    # np.ravel copes with PLS returning either a (1, 1) or a (1,) array,
    # which depends on the scikit-learn version.
    y_pred_PLS_1957.append(np.ravel(PLS_model.predict(X_new))[0])
    y_pred_PCR_1957.append(PCR_forecast.predict(PCR_model.transform(X_new))[0])
    y_pred_LASSO_1957.append(LASSO_model.predict(X_new)[0])
    y_pred_ENet_1957.append(ENet_model.predict(X_new)[0])
    y_pred_RF_1957.append(RF_model.predict(X_new)[0])
    y_pred_NN2_1957.append(NN2_model.predict(X_new_tensor)[0][0])
    y_pred_NN4_1957.append(NN4_model.predict(X_new_tensor)[0][0])
    # Other commonly used ML methods
    y_pred_Ridge_1957.append(Ridge_model.predict(X_new)[0])

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 767/767 [10:25:36<00:00, 48.94s/it]


In [9]:
# Collect the per-model forecast lists into one table: one row per
# out-of-sample month (labelled with predictor_df.month) and one column per model.
ml_forecasts_1957 = {
    'PLS': y_pred_PLS_1957,
    'PCR': y_pred_PCR_1957,
    'LASSO': y_pred_LASSO_1957,
    'ENet': y_pred_ENet_1957,
    'RF': y_pred_RF_1957,
    'NN2': y_pred_NN2_1957,
    'NN4': y_pred_NN4_1957,
    'Ridge': y_pred_Ridge_1957,
}
# Stack as (models x months) first, exactly like the labelled frame below
# expects, and transpose at the end so that models become the columns.
forecast_matrix = np.array(list(ml_forecasts_1957.values()))
y_ml_pred_1957 = pd.DataFrame(forecast_matrix,
                              index=list(ml_forecasts_1957),
                              columns=predictor_df.month[in_out_1957:N]).T

In [10]:
# Performance compared with HA benchmark

def compute_oos_r_square(actual, y_benchmark, y_pred):
    """Return the out-of-sample R-squared of a forecast relative to a benchmark.

    Computed as ``1 - MSFE(y_pred) / MSFE(y_benchmark)``, where MSFE is the
    mean squared error of a forecast against ``actual``. The value is positive
    when ``y_pred`` has a lower mean squared error than ``y_benchmark``.

    Parameters
    ----------
    actual : realised values.
    y_benchmark : benchmark forecasts for the same observations.
    y_pred : candidate forecasts for the same observations.
    """
    benchmark_msfe = mean_squared_error(y_benchmark, actual)
    candidate_msfe = mean_squared_error(y_pred, actual)
    msfe_ratio = candidate_msfe / benchmark_msfe
    return 1 - msfe_ratio


# One row of evaluation metrics per model, each measured against the
# historical-average (HA) benchmark over the out-of-sample window.
metric_names = ['oos_r_square', 'MSFE_adjusted', 'pvalue_MSFE',
                'success_ratio', 'PT_stat', 'pvalue_PT']
ml_oos_performance_cv5 = []

for col in y_ml_pred_1957.columns:
    # Forecasts of the current model as a single-column 2-D array.
    y_pred_col = y_ml_pred_1957[[col]].to_numpy()
    oos_r_square = compute_oos_r_square(actual_1957, y_pred_HA_1957, y_pred_col)
    MSFE_adjusted, pvalue_MSFE = CW_test(actual_1957, y_pred_HA_1957, y_pred_col)
    success_ratio, PT_stat, pvalue_PT = PT_test(actual_1957, y_pred_col)
    # R-squared and success ratio are reported in percent.
    metrics_row = [oos_r_square * 100, MSFE_adjusted, pvalue_MSFE,
                   success_ratio * 100, PT_stat, pvalue_PT]
    ml_oos_performance_cv5.append(metrics_row)


ml_oos_performance_df_cv5 = pd.DataFrame(np.array(ml_oos_performance_cv5),
                                          index=y_ml_pred_1957.columns,
                                          columns=metric_names)
# success ratio of HA
# The benchmark row has an R-squared of 0 by construction and no CW statistic,
# because that test compares a model *against* the benchmark.
success_ratio_HA_1957, PT_HA_1957, p2_HA_1957 = PT_test(actual_1957, y_pred_HA_1957)
ml_oos_performance_df_cv5.loc['HA'] = [0, np.nan, np.nan, success_ratio_HA_1957 * 100, PT_HA_1957, p2_HA_1957]

  stat = (p_hat - p_star) / np.sqrt(p_hat_var - p_star_var)


In [11]:
# Save the performance table as its own sheet of the robustness-check workbook
# and display it.
import openpyxl
robust_checks_path = "ml_equity_premium_robust_checks.xlsx"
# Append mode keeps the sheets already stored in the workbook, but it raises
# if the file does not exist yet, so fall back to creating a new workbook.
writer_mode = 'a' if os.path.exists(robust_checks_path) else 'w'
with pd.ExcelWriter(robust_checks_path, engine='openpyxl', mode=writer_mode) as writer:
    ml_oos_performance_df_cv5.to_excel(writer, sheet_name='Alternative_validation_schemes')
ml_oos_performance_df_cv5

Unnamed: 0,oos_r_square,MSFE_adjusted,pvalue_MSFE,success_ratio,PT_stat,pvalue_PT
PLS,-1.04879,1.887934,0.029517,59.322034,3.206005,0.000673
PCR,-2.180044,0.275585,0.391433,57.366362,1.614489,0.053211
LASSO,0.196495,1.137284,0.12771,59.452412,1.859792,0.031458
ENet,-0.172823,0.943372,0.172745,58.539765,1.200649,0.114944
RF,-7.535413,-0.807213,0.790228,57.105606,-1.16157,0.877295
NN2,-0.475091,0.515088,0.303246,59.322034,-0.402961,0.656512
NN4,-0.978817,-0.292618,0.615093,59.191656,0.819584,0.206227
Ridge,-0.409972,0.814798,0.207594,58.9309,1.705101,0.044088
HA,0.0,,,59.973924,,
