### Using a robust scoring function on the validation set

In [1]:
import os
# Show the current working directory: the data file and helper modules used below are loaded relative to it.
os.getcwd()
# os.chdir(path)    # uncomment and set `path` to change the working directory.

'/Users/xingfuxu/PycharmProjects/EquityPremiumPredictionML-Jupyter'

In [2]:
# Your working dir should include the "NN_models.py", "Perform_CW_test.py" and "Perform_PT_test.py" files.
from Perform_CW_test import CW_test
from Perform_PT_test import PT_test
from NN_models import Net1, Net2, Net3, Net4, Net5

In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import ParameterGrid
import torch
from skorch import NeuralNetRegressor
from tqdm import tqdm
#
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, median_absolute_error

In [4]:
# Seed both random number generators used in this notebook (PyTorch and NumPy)
# so that repeated runs start from the same state.
torch.manual_seed(1)
np.random.seed(1)

# Read the predictor sheet. The path is handed to pandas directly so that pandas
# opens AND closes the workbook itself; wrapping it in a bare open(...) left the
# file handle open for the lifetime of the kernel.
predictor_df = pd.read_excel('ml_equity_premium_data.xlsx', sheet_name='result_predictor')
predictor_df.head()

Unnamed: 0,month,log_equity_premium,equity_premium,DP,DY,EP,SVAR,BM,NTIS,TBL,...,MA_2_9,MA_2_12,MA_3_9,MA_3_12,MOM_1,MOM_2,MOM_3,MOM_6,MOM_9,MOM_12
0,192701,-0.00571,-0.00571,-2.942374,-2.963349,-2.374773,0.00047,0.44371,0.05082,3.23,...,1,1,1,1,0,0,1,1,1,1
1,192702,0.042017,0.04302,-2.979535,-2.932946,-2.430353,0.00029,0.4285,0.05167,3.29,...,1,1,1,1,1,1,1,1,1,1
2,192703,0.004697,0.00472,-2.976535,-2.970053,-2.445079,0.00092,0.46977,0.04636,3.2,...,1,1,1,1,1,1,1,1,1,1
3,192704,0.00994,0.01002,-2.984225,-2.967143,-2.471309,0.0006,0.45675,0.05051,3.39,...,1,1,1,1,1,1,1,1,1,1
4,192705,0.057987,0.05985,-3.025963,-2.975058,-2.531446,0.00039,0.43478,0.05528,3.33,...,1,1,1,1,1,1,1,1,1,1


In [5]:
# Drop the columns that are neither the forecast target nor a predictor.
predictor0 = predictor_df.drop(columns=['month', 'equity_premium'])

# Align each month's predictors with the NEXT month's log equity premium:
# column 0 holds the target shifted one month ahead, the remaining columns hold
# the predictors observed in the current month (the last month has no target and is dropped).
log_premium = predictor0['log_equity_premium'].to_numpy()
next_month_premium = log_premium[1:].reshape(-1, 1)
current_predictors = predictor0.iloc[:-1, 1:].to_numpy()
predictor = np.hstack((next_month_premium, current_predictors))

# N: number of rows; n_cols: number of columns including the target in column 0.
N, n_cols = predictor.shape

# Realised one-month-ahead log equity premium, kept as a column vector.
actual = predictor[:, [0]]

# Historical-average benchmark: the forecast made in row i is the mean of the
# log equity premium over rows 0..i.
y_pred_HA = (np.cumsum(log_premium[:-1]) / np.arange(1, N + 1)).reshape(-1, 1)

In [6]:
## Out-of-sample: 1957:01-2020:12
# Row label of the first out-of-sample month (January 1957).
in_out_1957 = predictor_df.index[predictor_df['month'].eq(195701)][0]

# Realised values and the historical-average benchmark restricted to the out-of-sample rows.
actual_1957 = actual[in_out_1957:]
y_pred_HA_1957 = y_pred_HA[in_out_1957:]
MSFE_HA_1957 = mean_squared_error(actual_1957, y_pred_HA_1957)

# Forecast containers, one list per model; the forecasting loop appends one value per month.
# Machine Learning methods used in GKX (2020)
y_pred_OLS_1957 = []
y_pred_PLS_1957 = []
y_pred_PCR_1957 = []
y_pred_LASSO_1957 = []
y_pred_ENet_1957 = []
y_pred_GBRT_1957 = []
y_pred_RF_1957 = []
y_pred_NN1_1957 = []
y_pred_NN2_1957 = []
y_pred_NN3_1957 = []
y_pred_NN4_1957 = []
y_pred_NN5_1957 = []

## Other commonly used machine learning methods
y_pred_Ridge_1957 = []
y_pred_SVR_1957 = []
y_pred_KNR_1957 = []
y_pred_XGBoost_1957 = []
y_pred_combination_1957 = []

In [7]:
# Refit-schedule counter for the out-of-sample loop below: the loop re-estimates the
# models whenever month_index % 12 == 1 and increments the counter once per forecast month.
month_index = 1  # starting at 1 updates the models annually, i.e. in out-of-sample months 1, 13, 25, ...

In [8]:
# ---------------------------------------------------------------------------
# Recursive out-of-sample forecasting.
#
# For every out-of-sample month t the models are (re-)estimated on rows [0, t)
# when a refit is due and then asked for a one-step-ahead forecast from the
# predictors in row t.  Hyper-parameters are chosen on the most recent 15% of
# the estimation window using the median absolute error (a robust score), and
# the winning configuration is refitted on the full estimation window.
# ---------------------------------------------------------------------------


def _tune_and_refit(build, param_grid, train, validation, full):
    """Pick hyper-parameters on the validation split, then refit on all data.

    Parameters
    ----------
    build : callable
        Called as ``build(**params)``; must return an unfitted estimator
        exposing ``fit(X, y)`` and ``predict(X)``.
    param_grid : dict or None
        Grid expanded with ``ParameterGrid``.  ``None`` (or an empty dict)
        skips the search and fits ``build()`` directly on ``full``.
    train, validation, full : tuple
        ``(X, y)`` pairs used for candidate fitting, candidate scoring and
        the final fit, respectively.

    Returns
    -------
    The estimator built with the selected parameters and fitted on ``full``.
    """
    best_params = {}
    if param_grid:
        candidates = list(ParameterGrid(param_grid))
        errors = []
        for params in candidates:
            candidate = build(**params)
            candidate.fit(*train)
            errors.append(median_absolute_error(validation[1], candidate.predict(validation[0])))
        # Ties are resolved in favour of the earliest grid point.
        best_params = candidates[min(range(len(errors)), key=errors.__getitem__)]
    model = build(**best_params)
    model.fit(*full)
    return model


def _point_forecast(model, x_row):
    """Return the scalar forecast of ``model`` for the single observation ``x_row``.

    The prediction is flattened first, so the same code handles ``predict``
    outputs of shape ``(1,)`` as well as ``(1, 1)``.
    """
    return np.ravel(model.predict(x_row))[0]


class _PCRegressor:
    """Principal component regression: PCA projection followed by OLS on the components."""

    def __init__(self, **pca_params):
        self.pca = PCA(**pca_params)
        self.ols = LinearRegression()

    def fit(self, X, y):
        # fit() and transform() are called separately on purpose: the training
        # components are produced by the same projection later applied to new data.
        self.pca.fit(X)
        self.ols.fit(self.pca.transform(X), y)
        return self

    def predict(self, X):
        return self.ols.predict(self.pca.transform(X))


def _nn_builder(module, hidden_sizes):
    """Return a factory creating a skorch regressor for ``module`` with the given hidden-layer widths."""
    architecture = {"module__n_feature": n_cols - 1,  # n_feature is the number of predictors
                    "module__n_output": 1}
    for layer, width in enumerate(hidden_sizes, start=1):
        architecture["module__n_hidden{}".format(layer)] = width

    def build(**params):
        return NeuralNetRegressor(module, verbose=0, max_epochs=200,
                                  optimizer=torch.optim.SGD,
                                  **architecture, **params)

    return build


# Hyper-parameter grids shared by several models.
_component_grid = {'n_components': [1, 2, 3, 4, 5, 6, 7, 8]}
_shrinkage_grid = list(10 ** np.arange(-4, 1 + 0.001, 0.2))
_tree_grid = {'n_estimators': [10, 50, 100, 150, 200],
              'max_depth': [2, 3, 4],
              'min_samples_leaf': [1, 3, 5]}
_nn_grid = {'module__dropout': [0.2, 0.4, 0.6, 0.8],
            'lr': [0.001, 0.01],
            'optimizer__weight_decay': [0.1, 0.01, 0.001]}
_nn_hidden = (32, 16, 8, 4, 2)  # NNk uses the first k widths

# (label, estimator factory, grid, needs torch tensors).
# NOTE: keep this order stable.  Each model is fitted and queried in list order,
# and the stochastic learners draw from the generators seeded at the top of the
# notebook, so reordering the list would change the results.
_model_specs = [
    # Machine learning methods used in GKX (2020)
    ('OLS', LinearRegression, None, False),
    ('PLS', PLSRegression, _component_grid, False),
    ('PCR', _PCRegressor, _component_grid, False),
    ('LASSO', Lasso, {'alpha': _shrinkage_grid}, False),
    ('ENet', ElasticNet, {'alpha': _shrinkage_grid,
                          'l1_ratio': list(np.arange(0.2, 1, 0.3))}, False),
    ('GBRT', GradientBoostingRegressor, _tree_grid, False),
    ('RF', RandomForestRegressor, _tree_grid, False),
    ('NN1', _nn_builder(Net1, _nn_hidden[:1]), _nn_grid, True),
    ('NN2', _nn_builder(Net2, _nn_hidden[:2]), _nn_grid, True),
    ('NN3', _nn_builder(Net3, _nn_hidden[:3]), _nn_grid, True),
    ('NN4', _nn_builder(Net4, _nn_hidden[:4]), _nn_grid, True),
    ('NN5', _nn_builder(Net5, _nn_hidden[:5]), _nn_grid, True),
    # Other commonly used ML methods
    ('Ridge', Ridge, {'alpha': list(10 ** np.arange(0, 20 + 0.001, 1))}, False),
    ('SVR', SVR, {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                  'degree': [2, 3, 4], 'C': [0.1, 0.5, 1]}, False),
    ('KNR', KNeighborsRegressor, {'n_neighbors': [3, 4, 5, 6, 7],
                                  'weights': ['distance', 'uniform'],
                                  'leaf_size': [20, 30, 40], 'p': [1, 2, 3]}, False),
    ('XGBoost', XGBRegressor, {'max_depth': [4, 5, 6, 7, 8], 'eta': [0.01, 0.1],
                               'lambda': [0, 0.5, 1], 'alpha': [0, 0.5, 1]}, False),
]

# Forecasts are appended to the lists created in the set-up cell above.
_forecast_store = {
    'OLS': y_pred_OLS_1957, 'PLS': y_pred_PLS_1957, 'PCR': y_pred_PCR_1957,
    'LASSO': y_pred_LASSO_1957, 'ENet': y_pred_ENet_1957, 'GBRT': y_pred_GBRT_1957,
    'RF': y_pred_RF_1957, 'NN1': y_pred_NN1_1957, 'NN2': y_pred_NN2_1957,
    'NN3': y_pred_NN3_1957, 'NN4': y_pred_NN4_1957, 'NN5': y_pred_NN5_1957,
    'Ridge': y_pred_Ridge_1957, 'SVR': y_pred_SVR_1957, 'KNR': y_pred_KNR_1957,
    'XGBoost': y_pred_XGBoost_1957,
}

_fitted = {}  # label -> most recently fitted model

for t in tqdm(range(in_out_1957, N)):
    # Estimation window: every row strictly before t.
    X_train_all = predictor[:t, 1:n_cols]
    y_train_all = predictor[:t, 0]
    # The earliest 85% is used to fit candidates, the latest 15% to validate them.
    n_fit = int(len(X_train_all) * 0.85)
    X_train, X_validation = X_train_all[:n_fit, :], X_train_all[n_fit:, :]
    y_train, y_validation = y_train_all[:n_fit], y_train_all[n_fit:]

    # Predictors of the month being forecast, as a (1, n_features) row.
    x_next = predictor[[t], 1:n_cols]
    x_next_tensor = torch.tensor(x_next, dtype=torch.float)

    # Refit annually (months 1, 13, 25, ...).  Also refit when no model exists yet,
    # so the loop cannot reach the forecasting step with nothing fitted.
    refit = month_index % 12 == 1 or not _fitted
    month_index += 1

    if refit:
        numpy_data = ((X_train, y_train),
                      (X_validation, y_validation),
                      (X_train_all, y_train_all))
        # Networks train on float tensors with a column-vector target; their
        # validation predictions are still scored against the NumPy target.
        tensor_data = ((torch.tensor(X_train, dtype=torch.float),
                        torch.tensor(y_train.reshape(-1, 1), dtype=torch.float)),
                       (torch.tensor(X_validation, dtype=torch.float), y_validation),
                       (torch.tensor(X_train_all, dtype=torch.float),
                        torch.tensor(y_train_all.reshape(-1, 1), dtype=torch.float)))

    for label, build, grid, uses_tensors in _model_specs:
        if refit:
            splits = tensor_data if uses_tensors else numpy_data
            _fitted[label] = _tune_and_refit(build, grid, *splits)
        _forecast_store[label].append(
            _point_forecast(_fitted[label], x_next_tensor if uses_tensors else x_next))

# Expose the final fitted models under their established notebook-level names.
if _fitted:
    OLS, PLS_model = _fitted['OLS'], _fitted['PLS']
    PCR_model, PCR_forecast = _fitted['PCR'].pca, _fitted['PCR'].ols
    LASSO_model, ENet_model = _fitted['LASSO'], _fitted['ENet']
    GBRT_model, RF_model = _fitted['GBRT'], _fitted['RF']
    NN1_model, NN2_model, NN3_model = _fitted['NN1'], _fitted['NN2'], _fitted['NN3']
    NN4_model, NN5_model = _fitted['NN4'], _fitted['NN5']
    Ridge_model, SVR_model = _fitted['Ridge'], _fitted['SVR']
    KNR_model, XGB_model = _fitted['KNR'], _fitted['XGBoost']

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 767/767 [12:21:54<00:00, 58.04s/it]


In [9]:
# Gather every model's out-of-sample forecasts into one table:
# one row per forecast month, one column per model.
labelled_forecasts = [
    ('OLS', y_pred_OLS_1957), ('PLS', y_pred_PLS_1957), ('PCR', y_pred_PCR_1957),
    ('LASSO', y_pred_LASSO_1957), ('ENet', y_pred_ENet_1957), ('GBRT', y_pred_GBRT_1957),
    ('RF', y_pred_RF_1957), ('NN1', y_pred_NN1_1957), ('NN2', y_pred_NN2_1957),
    ('NN3', y_pred_NN3_1957), ('NN4', y_pred_NN4_1957), ('NN5', y_pred_NN5_1957),
    ('Ridge', y_pred_Ridge_1957), ('SVR', y_pred_SVR_1957), ('KNR', y_pred_KNR_1957),
    ('XGBoost', y_pred_XGBoost_1957),
]
model_labels = [label for label, _ in labelled_forecasts]
forecast_rows = np.array([values for _, values in labelled_forecasts])
oos_months = predictor_df.month[in_out_1957:N]

# Built model-by-month first and then transposed, so months end up on the index.
y_ml_pred = pd.DataFrame(forecast_rows, index=model_labels, columns=oos_months).T

# Equal-weighted combination: the cross-model average forecast for each month.
y_ml_pred['Combined'] = y_ml_pred.mean(axis=1)
y_ml_pred.head()

Unnamed: 0_level_0,OLS,PLS,PCR,LASSO,ENet,GBRT,RF,NN1,NN2,NN3,NN4,NN5,Ridge,SVR,KNR,XGBoost,Combined
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
195701,0.004428,0.017212,0.007213,0.006473,0.006473,0.006212,0.008328,-0.020846,0.009036,-0.044783,0.020064,0.011484,0.006473,0.0517,0.032029,0.007727,0.008077
195702,0.005713,0.008221,0.007902,0.006473,0.006473,0.006212,0.010835,0.033707,0.009018,-0.059487,0.016915,0.011484,0.006473,-0.022794,0.011661,-0.00494,0.003367
195703,0.000727,0.013473,0.008082,0.006473,0.006473,0.006212,0.010835,0.006636,0.006358,-0.06282,0.016444,0.011484,0.006473,-0.001442,0.010643,-0.004789,0.002579
195704,-0.007605,0.007329,0.009294,0.006473,0.006473,0.007569,0.011015,-0.056333,-0.003324,-0.04918,0.016485,0.011484,0.006473,-0.030523,-0.002674,-0.00562,-0.004542
195705,-0.002662,0.002282,0.009749,0.006473,0.006473,0.012371,0.011293,0.021646,-0.002817,-0.031841,0.017856,0.011484,0.006473,-0.039166,0.013793,-0.007002,0.002275


In [10]:
# Performance compared with HA benchmark

def compute_oos_r_square(actual, y_benchmark, y_pred):
    """Out-of-sample R-squared of ``y_pred`` relative to ``y_benchmark``.

    Defined as ``1 - MSFE(y_pred) / MSFE(y_benchmark)``, where MSFE is the
    mean squared forecast error measured against ``actual``.  A positive
    value means ``y_pred`` has a lower MSFE than the benchmark.  The result
    is a fraction, not a percentage.
    """
    msfe_ratio = mean_squared_error(actual, y_pred) / mean_squared_error(actual, y_benchmark)
    return 1 - msfe_ratio

def _forecast_column(model_name):
    """Return the forecasts of one model as a freshly extracted single-column 2-D array."""
    return y_ml_pred[[model_name]].to_numpy()


# One row of statistics per model, each compared against the historical-average benchmark.
ml_oos_performance = []
for model_name in y_ml_pred.columns:
    r2_oos = compute_oos_r_square(actual_1957, y_pred_HA_1957, _forecast_column(model_name))
    # CW_test yields the MSFE-adjusted statistic and its p-value.
    msfe_adj, msfe_pvalue = CW_test(actual_1957, y_pred_HA_1957, _forecast_column(model_name))
    # PT_test yields the success ratio, the test statistic and its p-value.
    hit_ratio, pt_stat, pt_pvalue = PT_test(actual_1957, _forecast_column(model_name))
    # R-squared and success ratio are reported in percent.
    ml_oos_performance.append([r2_oos * 100, msfe_adj, msfe_pvalue,
                               hit_ratio * 100, pt_stat, pt_pvalue])

metric_names = ['oos_r_square', 'MSFE_adjusted', 'pvalue_MSFE',
                'success_ratio', 'PT_stat', 'pvalue_PT']
ml_oos_performance_df = pd.DataFrame(np.array(ml_oos_performance),
                                     index=y_ml_pred.columns,
                                     columns=metric_names)

# Benchmark row: the historical average has an out-of-sample R-squared of 0 by
# construction and no CW statistic against itself; only its PT statistics are computed.
success_ratio_HA_1957, PT_HA_1957, p2_HA_1957 = PT_test(actual_1957, y_pred_HA_1957)
ml_oos_performance_df.loc['HA'] = [0, np.nan, np.nan, success_ratio_HA_1957 * 100, PT_HA_1957, p2_HA_1957]
ml_oos_performance_df

  stat = (p_hat - p_star) / np.sqrt(p_hat_var - p_star_var)


Unnamed: 0,oos_r_square,MSFE_adjusted,pvalue_MSFE,success_ratio,PT_stat,pvalue_PT
OLS,-12.679806,0.613661,0.26972,56.062581,2.2,0.013903
PLS,-4.702669,0.172314,0.431595,55.671447,0.719744,0.235841
PCR,0.072784,1.925798,0.027065,59.713168,3.112951,0.000926
LASSO,-0.696281,1.197421,0.115571,58.800522,2.63233,0.00424
ENet,-1.772084,0.771034,0.220343,58.9309,2.653842,0.003979
GBRT,-22.579738,-0.475949,0.682945,56.584094,-0.867959,0.807292
RF,-16.507013,0.337619,0.367825,56.19296,-0.965328,0.83281
NN1,-26.869047,-1.028946,0.848247,58.409387,1.460584,0.072065
NN2,-13.608669,1.334754,0.090978,57.105606,0.634978,0.262721
NN3,-20.497497,-0.112818,0.544913,55.932203,0.410294,0.340795


In [11]:
import openpyxl  # backend used by the 'openpyxl' ExcelWriter engine below

# Save the performance table as its own sheet in the shared results workbook.
results_path = "ml_equity_premium_results.xlsx"
if os.path.exists(results_path):
    # Append to the existing workbook; replace this notebook's sheet if it is
    # already there so that re-running the cell does not fail.
    writer_options = {'mode': 'a', 'if_sheet_exists': 'replace'}
else:
    # Append mode requires an existing file, so start a new workbook instead.
    writer_options = {'mode': 'w'}

with pd.ExcelWriter(results_path, engine='openpyxl', **writer_options) as writer:
    ml_oos_performance_df.to_excel(writer, sheet_name='using_robust_scoring_function')