### Machine Learning In-sample Forecasting

In [1]:
import os
# Show the current working directory: the data workbook and NN_models.py are
# both referenced by relative path in the cells below.
os.getcwd()
# os.chdir(path)    # uncomment and set `path` to switch to your working dir.

'/Users/xingfuxu/PycharmProjects/EquityPremiumPredictionML-Jupyter'

In [2]:
# Your working dir should include "NN_models.py"
from NN_models import Net1, Net2, Net3, Net4, Net5

In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import ParameterGrid
import torch
from skorch import NeuralNetRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

#### Generate macro variables and technical indicators

In [4]:
# Load the raw monthly predictor sheet. The path is passed directly so pandas
# opens and closes the workbook itself (the earlier open(..., 'rb') handle was
# never closed).
predictor_raw = pd.read_excel('ml_equity_premium_data.xlsx', sheet_name='PredictorData1926-2020')
# predictor_raw.tail()
# predictor_raw.columns


## Generating equity risk premium, 1927:01-2020:12
# The market return of each month is paired with the risk-free rate of the
# previous row, so the first raw row only contributes its lagged rate.
n_rows = len(predictor_raw)
market_return = predictor_raw['CRSP_SPvw'].to_numpy()[1:]
risk_free_lag = predictor_raw['Rfree'].to_numpy()[:n_rows - 1]
equity_premium = market_return - risk_free_lag
log_equity_premium = np.log(1 + market_return) - np.log(1 + risk_free_lag)


### Generating 12 macroeconomic variables, 1927:01-2020:12
# Notes: we exclude the log dividend-earnings ratio (DE) and the long-term yield (LTY).

# Two row windows over the raw sheet: `current` drops the first row,
# `lagged` drops the last one (i.e. the value one row earlier).
current = predictor_raw.iloc[1:]
lagged = predictor_raw.iloc[:n_rows - 1]

# Shared price / dividend / earnings inputs
D12 = current['D12'].to_numpy()
E12 = current['E12'].to_numpy()
SP500 = current['Index'].to_numpy()
SP500_lag = lagged['Index'].to_numpy()

# (1) Dividend-price ratio (log), DP
DP = np.log(D12) - np.log(SP500)

# (2) Dividend yield (log), DY -- dividends relative to the lagged index level
DY = np.log(D12) - np.log(SP500_lag)

# (3) Earnings-price ratio (log), EP
EP = np.log(E12) - np.log(SP500)

# (4) stock variance, SVAR
SVAR = current['svar'].to_numpy()

# (5) Book-to-market ratio, BM
BM = current['b/m'].to_numpy()

# (6) Net equity expansion, NTIS
NTIS = current['ntis'].to_numpy()

# (7) Treasury bill rate (annual %), TBL
TBL = 100 * current['tbl'].to_numpy()

# (8) Long-term return (%), LTR
LTR = 100 * current['ltr'].to_numpy()

# (9) Term spread (annual %), TMS = long-term yield minus T-bill rate
LTY = 100 * current['lty'].to_numpy()
TMS = LTY - TBL

# (10) Default yield spread, DFY
AAA = current['AAA'].to_numpy()
BAA = current['BAA'].to_numpy()
DFY = 100 * (BAA - AAA)

# (11) Default return spread, DFR
CORPR = current['corpr'].to_numpy()
DFR = 100 * CORPR - LTR

# (12) Inflation (%, lagged), INFL -- taken from the lagged window
INFL = 100 * lagged['infl'].to_numpy()

## Collect 12 macroeconomic variables, one column per variable
macro = np.column_stack([DP, DY, EP, SVAR, BM, NTIS, TBL, LTR, TMS, DFY, DFR, INFL])
# macro.shape
#


## Collect 12 technical indicators (moving-average and momentum columns)
technical_columns = ['MA_1_9', 'MA_1_12', 'MA_2_9', 'MA_2_12', 'MA_3_9', 'MA_3_12',
                     'MOM_1', 'MOM_2', 'MOM_3', 'MOM_6', 'MOM_9', 'MOM_12']
# Drop the first row so the indicators line up with the premium series.
technical = predictor_raw[technical_columns].to_numpy()[1:]
# technical.shape

# Stack month stamp, both premium definitions, the 12 macro variables and the
# 12 technical indicators side by side.
yyyymm = predictor_raw['yyyymm'].to_numpy()[1:]
predictor_matrix = np.column_stack([yyyymm, log_equity_premium, equity_premium, macro, technical])

# Label the columns in the same order they were stacked.
result_columns = (
    ['month', 'log_equity_premium', 'equity_premium']
    + ['DP', 'DY', 'EP', 'SVAR', 'BM', 'NTIS', 'TBL', 'LTR', 'TMS', 'DFY', 'DFR', 'INFL']
    + ['MA_1_9', 'MA_1_12', 'MA_2_9', 'MA_2_12', 'MA_3_9', 'MA_3_12',
       'MOM_1', 'MOM_2', 'MOM_3', 'MOM_6', 'MOM_9', 'MOM_12']
)
result_predictor = pd.DataFrame(predictor_matrix, columns=result_columns)
result_predictor

Unnamed: 0,month,log_equity_premium,equity_premium,DP,DY,EP,SVAR,BM,NTIS,TBL,...,MA_2_9,MA_2_12,MA_3_9,MA_3_12,MOM_1,MOM_2,MOM_3,MOM_6,MOM_9,MOM_12
0,192701.0,-0.005710,-0.00571,-2.942374,-2.963349,-2.374773,0.00047,0.44371,0.05082,3.23,...,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0
1,192702.0,0.042017,0.04302,-2.979535,-2.932946,-2.430353,0.00029,0.42850,0.05167,3.29,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,192703.0,0.004697,0.00472,-2.976535,-2.970053,-2.445079,0.00092,0.46977,0.04636,3.20,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,192704.0,0.009940,0.01002,-2.984225,-2.967143,-2.471309,0.00060,0.45675,0.05051,3.39,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,192705.0,0.057987,0.05985,-3.025963,-2.975058,-2.531446,0.00039,0.43478,0.05528,3.33,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1123,202008.0,0.069491,0.07197,-4.080892,-4.013173,-3.569975,0.00074,0.23597,-0.00850,0.10,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1124,202009.0,-0.038997,-0.03825,-4.045576,-4.085595,-3.533379,0.00491,0.24148,-0.00570,0.11,...,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
1125,202010.0,-0.026865,-0.02651,-4.020768,-4.048824,-3.519300,0.00366,0.25315,-0.00190,0.10,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
1126,202011.0,0.103719,0.10930,-4.126173,-4.024026,-3.635623,0.00249,0.22635,-0.00526,0.09,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
# save
import openpyxl
# mode='a' appends a sheet to the existing workbook. if_sheet_exists='replace'
# (pandas >= 1.3) overwrites the 'result_predictor' sheet when it is already
# there, so this cell can be re-run instead of raising on the second run.
with pd.ExcelWriter("ml_equity_premium_data.xlsx", engine='openpyxl', mode='a',
                    if_sheet_exists='replace') as writer:
    result_predictor.to_excel(writer, sheet_name='result_predictor', index=False)

In [6]:
# set seed (NumPy global RNG and PyTorch) before any model is fitted
np.random.seed(12)
torch.manual_seed(12)

# read data -- pass the path so pandas manages the file handle itself
# (the earlier open(..., 'rb') handle was never closed)
predictor_df = pd.read_excel('ml_equity_premium_data.xlsx', sheet_name='result_predictor')
predictor_df

Unnamed: 0,month,log_equity_premium,equity_premium,DP,DY,EP,SVAR,BM,NTIS,TBL,...,MA_2_9,MA_2_12,MA_3_9,MA_3_12,MOM_1,MOM_2,MOM_3,MOM_6,MOM_9,MOM_12
0,192701,-0.005710,-0.00571,-2.942374,-2.963349,-2.374773,0.00047,0.44371,0.05082,3.23,...,1,1,1,1,0,0,1,1,1,1
1,192702,0.042017,0.04302,-2.979535,-2.932946,-2.430353,0.00029,0.42850,0.05167,3.29,...,1,1,1,1,1,1,1,1,1,1
2,192703,0.004697,0.00472,-2.976535,-2.970053,-2.445079,0.00092,0.46977,0.04636,3.20,...,1,1,1,1,1,1,1,1,1,1
3,192704,0.009940,0.01002,-2.984225,-2.967143,-2.471309,0.00060,0.45675,0.05051,3.39,...,1,1,1,1,1,1,1,1,1,1
4,192705,0.057987,0.05985,-3.025963,-2.975058,-2.531446,0.00039,0.43478,0.05528,3.33,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1123,202008,0.069491,0.07197,-4.080892,-4.013173,-3.569975,0.00074,0.23597,-0.00850,0.10,...,1,1,1,1,1,1,1,1,1,1
1124,202009,-0.038997,-0.03825,-4.045576,-4.085595,-3.533379,0.00491,0.24148,-0.00570,0.11,...,1,1,1,1,0,1,1,1,1,1
1125,202010,-0.026865,-0.02651,-4.020768,-4.048824,-3.519300,0.00366,0.25315,-0.00190,0.10,...,1,1,1,1,0,0,0,1,1,1
1126,202011,0.103719,0.10930,-4.126173,-4.024026,-3.635623,0.00249,0.22635,-0.00526,0.09,...,1,1,1,1,1,1,1,1,1,1


In [7]:
# remove irrelevant columns
predictor0 = predictor_df.drop(columns=['month', 'equity_premium'])
n_months = predictor0.shape[0]

# Pair each row of predictors with the log equity premium of the NEXT row:
# column 0 is the 1-month-ahead target, the remaining columns are the predictors.
next_month_premium = predictor0['log_equity_premium'][1:].values.reshape(-1, 1)
current_predictors = predictor0.iloc[0:(n_months - 1), 1:]
predictor = np.concatenate([next_month_premium, current_predictors], axis=1)

# number of rows, and number of all columns (including the log equity premium)
N, n_cols = predictor.shape

# Actual one-month ahead log equity premium, kept as a column vector
actual = predictor[:, [0]]

# Historical average forecasting as benchmark: expanding mean of the premiums
# observed up to and including the predictor row.
premium_history = predictor0['log_equity_premium'].values[0:(n_months - 1)]
y_pred_HA = (premium_history.cumsum() / np.arange(1, N + 1)).reshape(-1, 1)

In [8]:
# Full sample: predictors are every column after the target in column 0
X_train_all = predictor[:, 1:n_cols]
y_train_all = predictor[:, 0]

# Chronological split: the first 85% of rows are used for fitting, the
# remaining 15% are held out as the validation set.
split_at = int(len(X_train_all) * 0.85)
X_train, X_validation = X_train_all[0:split_at, :], X_train_all[split_at:, :]
y_train, y_validation = y_train_all[0:split_at], y_train_all[split_at:]

In [9]:
# OLS: nothing to tune, so it is fitted on the full sample directly
print("Starting OLS ...")
OLS = LinearRegression().fit(X_train_all, y_train_all)
y_in_pred_OLS = OLS.predict(X_train_all).reshape(-1, 1)

# PLS: pick n_components on the validation split, then refit on the full sample
print("Starting PLS ...")
PLS_param = {'n_components': [1, 2, 3, 4, 5, 6, 7, 8]}
PLS_result = {}   # str(param) -> validation MSE (kept for inspection)
PLS_trials = []   # (validation MSE, param dict) in grid order
for param in ParameterGrid(PLS_param):
    PLS = PLSRegression(**param)
    PLS.fit(X_train, y_train)
    mse = mean_squared_error(y_validation, PLS.predict(X_validation))
    PLS_result[str(param)] = mse
    PLS_trials.append((mse, param))

# Take the winning dict directly instead of rebuilding it with eval(str(param));
# min() keeps the first grid point on ties, as before.
PLS_best_param = min(PLS_trials, key=lambda trial: trial[0])[1]
PLS_model = PLSRegression(**PLS_best_param)
PLS_model.fit(X_train_all, y_train_all)
y_in_pred_PLS = PLS_model.predict(X_train_all).reshape(-1, 1)

# PCR: PCA followed by a linear regression on the components; the number of
# components is picked on the validation split.
print("Starting PCR ...")
PCR_param = {'n_components': [1, 2, 3, 4, 5, 6, 7, 8]}
PCR_result = {}   # str(param) -> validation MSE (kept for inspection)
PCR_trials = []   # (validation MSE, param dict) in grid order
for param in ParameterGrid(PCR_param):
    pca = PCA(**param)
    pca.fit(X_train)
    comps = pca.transform(X_train)
    forecast = LinearRegression()
    forecast.fit(comps, y_train)
    mse = mean_squared_error(y_validation, forecast.predict(pca.transform(X_validation)))
    PCR_result[str(param)] = mse
    PCR_trials.append((mse, param))

# Take the winning dict directly instead of rebuilding it with eval(str(param));
# min() keeps the first grid point on ties, as before.
PCR_best_param = min(PCR_trials, key=lambda trial: trial[0])[1]

# Refit both stages on the full sample
PCR_model = PCA(**PCR_best_param)
PCR_model.fit(X_train_all)
PCR_comps = PCR_model.transform(X_train_all)
PCR_forecast = LinearRegression()
PCR_forecast.fit(PCR_comps, y_train_all)
# PCR_comps already holds the transformed full sample, so it is reused here.
y_in_pred_PCR = PCR_forecast.predict(PCR_comps).reshape(-1, 1)

# LASSO: alpha is searched on a log grid from 1e-4 to 1e1 (step 0.2 in the exponent)
print("Starting LASSO...")
LASSO_param = {'alpha': list(10 ** np.arange(-4, 1 + 0.001, 0.2))}
LASSO_result = {}   # str(param) -> validation MSE (kept for inspection)
LASSO_trials = []   # (validation MSE, param dict) in grid order
for param in ParameterGrid(LASSO_param):
    LASSO = Lasso(**param)
    LASSO.fit(X_train, y_train)
    mse = mean_squared_error(y_validation, LASSO.predict(X_validation))
    LASSO_result[str(param)] = mse
    LASSO_trials.append((mse, param))

# Take the winning dict directly instead of rebuilding it with eval(str(param)),
# which depends on the repr of NumPy scalars being evaluable Python.
LASSO_best_param = min(LASSO_trials, key=lambda trial: trial[0])[1]

# Refit on the full sample with the selected alpha
LASSO_model = Lasso(**LASSO_best_param)
LASSO_model.fit(X_train_all, y_train_all)
y_in_pred_LASSO = LASSO_model.predict(X_train_all).reshape(-1, 1)

# ENet: joint grid over the penalty strength (alpha) and the L1/L2 mix (l1_ratio)
print("Starting ENet ...")
ENet_param = {'alpha': list(10 ** np.arange(-4, 1 + 0.001, 0.2)),
              'l1_ratio': list(np.arange(0.2, 1, 0.3))}
ENet_result = {}   # str(param) -> validation MSE (kept for inspection)
ENet_trials = []   # (validation MSE, param dict) in grid order
for param in ParameterGrid(ENet_param):
    ENet = ElasticNet(**param)
    ENet.fit(X_train, y_train)
    mse = mean_squared_error(y_validation, ENet.predict(X_validation))
    ENet_result[str(param)] = mse
    ENet_trials.append((mse, param))

# Take the winning dict directly instead of rebuilding it with eval(str(param)),
# which depends on the repr of NumPy scalars being evaluable Python.
ENet_best_param = min(ENet_trials, key=lambda trial: trial[0])[1]
ENet_model = ElasticNet(**ENet_best_param)
ENet_model.fit(X_train_all, y_train_all)
y_in_pred_ENet = ENet_model.predict(X_train_all).reshape(-1, 1)

# GBRT: grid over ensemble size, tree depth and minimum leaf size
print("Starting GBRT ...")
GBRT_param = {'n_estimators': [10, 50, 100, 150, 200],
              'max_depth': [2, 3, 4],
              'min_samples_leaf': [1, 3, 5]}
GBRT_result = {}   # str(param) -> validation MSE (kept for inspection)
GBRT_trials = []   # (validation MSE, param dict) in grid order
for param in ParameterGrid(GBRT_param):
    GBRT = GradientBoostingRegressor(**param)
    GBRT.fit(X_train, y_train)
    mse = mean_squared_error(y_validation, GBRT.predict(X_validation))
    GBRT_result[str(param)] = mse
    GBRT_trials.append((mse, param))

# Take the winning dict directly instead of rebuilding it with eval(str(param));
# min() keeps the first grid point on ties, as before.
GBRT_best_param = min(GBRT_trials, key=lambda trial: trial[0])[1]
GBRT_model = GradientBoostingRegressor(**GBRT_best_param)
GBRT_model.fit(X_train_all, y_train_all)
y_in_pred_GBRT = GBRT_model.predict(X_train_all).reshape(-1, 1)

# RF: grid over forest size, tree depth and minimum leaf size
print("Starting RF ...")
RF_param = {'n_estimators': [10, 50, 100, 150, 200],
            'max_depth': [2, 3, 4],
            'min_samples_leaf': [1, 3, 5]}
RF_result = {}   # str(param) -> validation MSE (kept for inspection)
RF_trials = []   # (validation MSE, param dict) in grid order
for param in ParameterGrid(RF_param):
    RF = RandomForestRegressor(**param)
    RF.fit(X_train, y_train)
    mse = mean_squared_error(y_validation, RF.predict(X_validation))
    RF_result[str(param)] = mse
    RF_trials.append((mse, param))

# Take the winning dict directly instead of rebuilding it with eval(str(param));
# min() keeps the first grid point on ties, as before.
RF_best_param = min(RF_trials, key=lambda trial: trial[0])[1]
RF_model = RandomForestRegressor(**RF_best_param)
RF_model.fit(X_train_all, y_train_all)
y_in_pred_RF = RF_model.predict(X_train_all).reshape(-1, 1)

# Neural Network Models: NN1~NN5
print("Starting Neural Network Models ...")
# float32 tensor copies of every split; targets are shaped (n, 1)
# Feature matrices
X_train_all_tensor = torch.tensor(X_train_all, dtype=torch.float32)
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_validation_tensor = torch.tensor(X_validation, dtype=torch.float32)
# Targets as column vectors
y_train_all_tensor = torch.tensor(y_train_all.reshape(-1, 1), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.reshape(-1, 1), dtype=torch.float32)
y_validation_tensor = torch.tensor(y_validation.reshape(-1, 1), dtype=torch.float32)

# NN1: Net1 configured with one hidden-layer width (32)
NN1_result = {}   # str(param) -> validation MSE (kept for inspection)
NN1_trials = []   # (validation MSE, param dict) in grid order
NN1_architecture = {"module__n_feature": X_train_tensor.shape[1],  # n_feature should be the number of predictors
                    "module__n_hidden1": 32,
                    "module__n_output": 1}
NN1_param = {'module__dropout': [0.2, 0.4, 0.6, 0.8],
             'lr': [0.001, 0.01],
             'optimizer__weight_decay': [0.1, 0.01, 0.001]}
for param in ParameterGrid(NN1_param):
    NN1 = NeuralNetRegressor(Net1, verbose=0, max_epochs=200,
                             optimizer=torch.optim.SGD,
                             **NN1_architecture, **param)
    NN1.fit(X_train_tensor, y_train_tensor)
    mse = mean_squared_error(y_validation, NN1.predict(X_validation_tensor))
    NN1_result[str(param)] = mse
    NN1_trials.append((mse, param))

# Take the winning dict directly instead of rebuilding it with eval(str(param));
# min() keeps the first grid point on ties, as before.
NN1_best_param = min(NN1_trials, key=lambda trial: trial[0])[1]
NN1_model = NeuralNetRegressor(Net1, verbose=0, max_epochs=200, optimizer=torch.optim.SGD,
                               **NN1_architecture, **NN1_best_param)
NN1_model.fit(X_train_all_tensor, y_train_all_tensor)
# Reuse the full-sample tensor built above rather than converting X_train_all again.
y_in_pred_NN1 = NN1_model.predict(X_train_all_tensor).reshape(-1, 1)


# NN2: Net2 configured with two hidden-layer widths (32, 16)
NN2_result = {}   # str(param) -> validation MSE (kept for inspection)
NN2_trials = []   # (validation MSE, param dict) in grid order
NN2_architecture = {"module__n_feature": X_train_tensor.shape[1],
                    "module__n_hidden1": 32, "module__n_hidden2": 16,
                    "module__n_output": 1}
NN2_param = {'module__dropout': [0.2, 0.4, 0.6, 0.8],
             'lr': [0.001, 0.01],
             'optimizer__weight_decay': [0.1, 0.01, 0.001]}
for param in ParameterGrid(NN2_param):
    NN2 = NeuralNetRegressor(Net2, verbose=0, max_epochs=200,
                             optimizer=torch.optim.SGD,
                             **NN2_architecture, **param)
    NN2.fit(X_train_tensor, y_train_tensor)
    mse = mean_squared_error(y_validation, NN2.predict(X_validation_tensor))
    NN2_result[str(param)] = mse
    NN2_trials.append((mse, param))

# Take the winning dict directly instead of rebuilding it with eval(str(param));
# min() keeps the first grid point on ties, as before.
NN2_best_param = min(NN2_trials, key=lambda trial: trial[0])[1]
NN2_model = NeuralNetRegressor(Net2, verbose=0, max_epochs=200, optimizer=torch.optim.SGD,
                               **NN2_architecture, **NN2_best_param)
NN2_model.fit(X_train_all_tensor, y_train_all_tensor)
# Reuse the full-sample tensor built above rather than converting X_train_all again.
y_in_pred_NN2 = NN2_model.predict(X_train_all_tensor).reshape(-1, 1)
#

# NN3: Net3 configured with three hidden-layer widths (32, 16, 8)
NN3_result = {}   # str(param) -> validation MSE (kept for inspection)
NN3_trials = []   # (validation MSE, param dict) in grid order
NN3_architecture = {"module__n_feature": X_train_tensor.shape[1],
                    # n_feature should be the number of predictors
                    "module__n_hidden1": 32, "module__n_hidden2": 16,
                    "module__n_hidden3": 8,
                    "module__n_output": 1}
NN3_param = {'module__dropout': [0.2, 0.4, 0.6, 0.8],
             'lr': [0.001, 0.01],
             'optimizer__weight_decay': [0.1, 0.01, 0.001]}
for param in ParameterGrid(NN3_param):
    NN3 = NeuralNetRegressor(Net3, verbose=0, max_epochs=200,
                             optimizer=torch.optim.SGD,
                             **NN3_architecture, **param)
    NN3.fit(X_train_tensor, y_train_tensor)
    mse = mean_squared_error(y_validation, NN3.predict(X_validation_tensor))
    NN3_result[str(param)] = mse
    NN3_trials.append((mse, param))

# Take the winning dict directly instead of rebuilding it with eval(str(param));
# min() keeps the first grid point on ties, as before.
NN3_best_param = min(NN3_trials, key=lambda trial: trial[0])[1]
NN3_model = NeuralNetRegressor(Net3, verbose=0, max_epochs=200, optimizer=torch.optim.SGD,
                               **NN3_architecture, **NN3_best_param)
NN3_model.fit(X_train_all_tensor, y_train_all_tensor)
# Reuse the full-sample tensor built above rather than converting X_train_all again.
y_in_pred_NN3 = NN3_model.predict(X_train_all_tensor).reshape(-1, 1)
#

# NN4: Net4 configured with four hidden-layer widths (32, 16, 8, 4)
NN4_result = {}   # str(param) -> validation MSE (kept for inspection)
NN4_trials = []   # (validation MSE, param dict) in grid order
NN4_architecture = {"module__n_feature": X_train_tensor.shape[1],
                    "module__n_hidden1": 32, "module__n_hidden2": 16,
                    "module__n_hidden3": 8,  "module__n_hidden4": 4,
                    "module__n_output": 1}
NN4_param = {'module__dropout': [0.2, 0.4, 0.6, 0.8],
             'lr': [0.001, 0.01],
             'optimizer__weight_decay': [0.1, 0.01, 0.001]}
for param in ParameterGrid(NN4_param):
    NN4 = NeuralNetRegressor(Net4, verbose=0, max_epochs=200,
                             optimizer=torch.optim.SGD,
                             **NN4_architecture, **param)
    NN4.fit(X_train_tensor, y_train_tensor)
    mse = mean_squared_error(y_validation, NN4.predict(X_validation_tensor))
    NN4_result[str(param)] = mse
    NN4_trials.append((mse, param))

# Take the winning dict directly instead of rebuilding it with eval(str(param));
# min() keeps the first grid point on ties, as before.
NN4_best_param = min(NN4_trials, key=lambda trial: trial[0])[1]
NN4_model = NeuralNetRegressor(Net4, verbose=0, max_epochs=200, optimizer=torch.optim.SGD,
                               **NN4_architecture, **NN4_best_param)
NN4_model.fit(X_train_all_tensor, y_train_all_tensor)
# Reuse the full-sample tensor built above rather than converting X_train_all again.
y_in_pred_NN4 = NN4_model.predict(X_train_all_tensor).reshape(-1, 1)
#

# NN5: Net5 configured with five hidden-layer widths (32, 16, 8, 4, 2)
NN5_result = {}   # str(param) -> validation MSE (kept for inspection)
NN5_trials = []   # (validation MSE, param dict) in grid order
NN5_architecture = {"module__n_feature": X_train_tensor.shape[1],
                    "module__n_hidden1": 32, "module__n_hidden2": 16,
                    "module__n_hidden3": 8,  "module__n_hidden4": 4,
                    "module__n_hidden5": 2,
                    "module__n_output": 1}
NN5_param = {'module__dropout': [0.2, 0.4, 0.6, 0.8],
             'lr': [0.001, 0.01],
             'optimizer__weight_decay': [0.1, 0.01, 0.001]}
for param in ParameterGrid(NN5_param):
    NN5 = NeuralNetRegressor(Net5, verbose=0, max_epochs=200,
                             optimizer=torch.optim.SGD,
                             **NN5_architecture, **param)
    NN5.fit(X_train_tensor, y_train_tensor)
    mse = mean_squared_error(y_validation, NN5.predict(X_validation_tensor))
    NN5_result[str(param)] = mse
    NN5_trials.append((mse, param))

# Take the winning dict directly instead of rebuilding it with eval(str(param));
# min() keeps the first grid point on ties, as before.
NN5_best_param = min(NN5_trials, key=lambda trial: trial[0])[1]
NN5_model = NeuralNetRegressor(Net5, verbose=0, max_epochs=200, optimizer=torch.optim.SGD,
                               **NN5_architecture, **NN5_best_param)
NN5_model.fit(X_train_all_tensor, y_train_all_tensor)
# Reuse the full-sample tensor built above rather than converting X_train_all again.
y_in_pred_NN5 = NN5_model.predict(X_train_all_tensor).reshape(-1, 1)
#

## Other commonly used ML methods
# Ridge: alpha is searched over powers of ten from 1e0 to 1e20
print("Starting Ridge ...")
Ridge_param = {'alpha': list(10 ** np.arange(0, 20 + 0.001, 1))}
Ridge_result = {}   # str(param) -> validation MSE (kept for inspection)
Ridge_trials = []   # (validation MSE, param dict) in grid order
for param in ParameterGrid(Ridge_param):
    RIDGE = Ridge(**param)
    RIDGE.fit(X_train, y_train)
    mse = mean_squared_error(y_validation, RIDGE.predict(X_validation))
    Ridge_result[str(param)] = mse
    Ridge_trials.append((mse, param))

# Take the winning dict directly instead of rebuilding it with eval(str(param)),
# which depends on the repr of NumPy scalars being evaluable Python.
Ridge_best_param = min(Ridge_trials, key=lambda trial: trial[0])[1]
Ridge_model = Ridge(**Ridge_best_param)
Ridge_model.fit(X_train_all, y_train_all)
y_in_pred_Ridge = Ridge_model.predict(X_train_all).reshape(-1, 1)

# SVR: grid over kernel type, polynomial degree and regularisation strength C
print("Starting SVR ...")
SVR_param = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'degree': [2, 3, 4], 'C': [0.1, 0.5, 1]}
SVR_result = {}   # str(param) -> validation MSE (kept for inspection)
SVR_trials = []   # (validation MSE, param dict) in grid order
for param in ParameterGrid(SVR_param):
    SVR_tmp = SVR(**param)
    SVR_tmp.fit(X_train, y_train)
    mse = mean_squared_error(y_validation, SVR_tmp.predict(X_validation))
    SVR_result[str(param)] = mse
    SVR_trials.append((mse, param))

# Take the winning dict directly instead of rebuilding it with eval(str(param));
# min() keeps the first grid point on ties, as before.
SVR_best_param = min(SVR_trials, key=lambda trial: trial[0])[1]
SVR_model = SVR(**SVR_best_param)
SVR_model.fit(X_train_all, y_train_all)
y_in_pred_SVR = SVR_model.predict(X_train_all).reshape(-1, 1)

# KNR: grid over neighbour count, weighting scheme, leaf size and Minkowski power p
print("Starting KNR ...")
KNR_param = {'n_neighbors': [3, 4, 5, 6, 7], 'weights': ['distance', 'uniform'],
             'leaf_size': [20, 30, 40], 'p': [1, 2, 3]}
KNR_result = {}   # str(param) -> validation MSE (kept for inspection)
KNR_trials = []   # (validation MSE, param dict) in grid order
for param in ParameterGrid(KNR_param):
    KNR = KNeighborsRegressor(**param)
    KNR.fit(X_train, y_train)
    mse = mean_squared_error(y_validation, KNR.predict(X_validation))
    KNR_result[str(param)] = mse
    KNR_trials.append((mse, param))

# Take the winning dict directly instead of rebuilding it with eval(str(param));
# min() keeps the first grid point on ties, as before.
KNR_best_param = min(KNR_trials, key=lambda trial: trial[0])[1]
KNR_model = KNeighborsRegressor(**KNR_best_param)
KNR_model.fit(X_train_all, y_train_all)
y_in_pred_KNR = KNR_model.predict(X_train_all).reshape(-1, 1)


# XGBoost: grid over tree depth, learning rate (eta) and the lambda / alpha penalties
print("Starting XGBoost ...")
XGBoost_param = {'max_depth': [4, 5, 6, 7, 8], 'eta': [0.01, 0.1],
                 'lambda': [0, 0.5, 1], 'alpha': [0, 0.5, 1]}
XGBoost_result = {}   # str(param) -> validation MSE (kept for inspection)
XGBoost_trials = []   # (validation MSE, param dict) in grid order
for param in ParameterGrid(XGBoost_param):
    XGBoost = XGBRegressor(**param)
    XGBoost.fit(X_train, y_train)
    mse = mean_squared_error(y_validation, XGBoost.predict(X_validation))
    XGBoost_result[str(param)] = mse
    XGBoost_trials.append((mse, param))

# Take the winning dict directly instead of rebuilding it with eval(str(param));
# min() keeps the first grid point on ties, as before.
XGB_best_param = min(XGBoost_trials, key=lambda trial: trial[0])[1]
XGB_model = XGBRegressor(**XGB_best_param)
XGB_model.fit(X_train_all, y_train_all)
y_in_pred_XGBoost = XGB_model.predict(X_train_all).reshape(-1, 1)

Starting OLS ...
Starting PLS ...
Starting PCR ...
Starting LASSO...
Starting ENet ...
Starting GBRT ...
Starting RF ...
Starting Neural Network Models ...
Starting Ridge ...
Starting SVR ...
Starting KNR ...
Starting XGBoost ...


In [10]:
# One column of in-sample predictions per model, in reporting order
in_sample_columns = [
    y_in_pred_OLS, y_in_pred_PLS, y_in_pred_PCR, y_in_pred_LASSO,
    y_in_pred_ENet, y_in_pred_GBRT, y_in_pred_RF,
    y_in_pred_NN1, y_in_pred_NN2, y_in_pred_NN3, y_in_pred_NN4, y_in_pred_NN5,
    y_in_pred_Ridge, y_in_pred_SVR, y_in_pred_KNR, y_in_pred_XGBoost,
]
y_ml_in_array = np.concatenate(in_sample_columns, axis=1)

In [11]:
# Label the prediction matrix: one column per model, indexed by the month of
# the predictor row.
model_names = ['OLS', 'PLS', 'PCR', 'LASSO', 'ENet', 'GBRT', 'RF',
               'NN1', 'NN2', 'NN3', 'NN4', 'NN5',
               'Ridge', 'SVR', 'KNR', 'XGBoost']
y_ml_in_pred = pd.DataFrame(y_ml_in_array, columns=model_names,
                            index=predictor_df['month'].iloc[:N])

# 'Combined' is the equal-weighted mean of the 16 model columns; it is computed
# before 'HA' is appended so the benchmark does not enter the average.
y_ml_in_pred['Combined'] = y_ml_in_pred.mean(axis=1)
y_ml_in_pred['HA'] = y_pred_HA.ravel()
y_ml_in_pred

Unnamed: 0_level_0,OLS,PLS,PCR,LASSO,ENet,GBRT,RF,NN1,NN2,NN3,NN4,NN5,Ridge,SVR,KNR,XGBoost,Combined,HA
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
192701,0.009202,0.009648,0.005764,0.005956,0.005364,0.005499,0.006797,0.009285,0.010538,0.008334,0.007474,0.009078,0.008233,-0.007226,0.039649,0.027951,0.010097,-0.005710
192702,0.004380,0.010465,0.007155,0.006050,0.005456,-0.000099,-0.016054,0.008263,0.011065,0.008335,0.006241,0.009078,0.007794,-0.014032,0.016533,0.009114,0.004984,0.018154
192703,-0.002157,0.010130,0.005868,0.005457,0.005066,0.005499,0.006797,0.008238,0.011121,0.008334,0.006754,0.009078,0.007206,-0.022366,0.023124,0.007397,0.005972,0.013668
192704,0.000146,0.010147,0.007508,0.006225,0.005554,0.005499,0.006797,0.007824,0.011017,0.008334,0.004626,0.009078,0.007585,-0.008991,0.002476,0.006952,0.005673,0.012736
192705,-0.002836,0.009058,0.004528,0.004476,0.004208,0.005499,0.006797,0.006729,0.009890,0.008334,0.008107,0.009078,0.005689,-0.019990,0.008594,0.012183,0.005022,0.021786
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202007,0.017213,0.009551,0.023311,0.016457,0.014564,0.005499,0.008094,0.016089,0.011787,0.008329,0.007517,0.009078,0.015512,-0.011391,0.029582,0.016882,0.012380,0.005218
202008,-0.006437,0.006043,-0.000628,0.000752,0.001497,0.005499,-0.011331,0.008112,0.008588,0.008333,0.006171,0.009078,-0.001269,-0.014053,0.005672,-0.017673,0.000522,0.005275
202009,-0.003173,0.006459,0.009180,0.007654,0.007312,0.005499,0.008094,0.012772,0.011439,0.008330,0.009184,0.009078,0.005946,-0.019148,-0.001256,-0.011811,0.004097,0.005236
202010,0.010124,0.006452,0.004954,0.005367,0.005292,0.005499,0.008094,0.012940,0.010679,0.008334,0.004850,0.009078,0.004779,0.003784,0.025179,0.026451,0.009491,0.005207


In [12]:
# 1. Report the in-sample R squares for comparison with historical mean
def compute_in_r_square(actual, y_benchmark, y_pred):
    """Return the R-square of `y_pred` relative to `y_benchmark`.

    Computed as 1 - MSE(y_pred, actual) / MSE(y_benchmark, actual): positive
    when the forecast has a lower mean squared error than the benchmark,
    zero when the two are equally accurate.
    """
    benchmark_error = mean_squared_error(y_benchmark, actual)
    forecast_error = mean_squared_error(y_pred, actual)
    error_ratio = forecast_error / benchmark_error
    return 1 - error_ratio

# 2. Report success ratio
def compute_success_ratio(actual, y_pred):
    """Return the fraction of observations where forecast and outcome share a sign.

    An observation counts as a success when `actual * y_pred > 0`, so a zero
    forecast or a zero outcome is never a success.

    Both inputs are flattened first. Without this, mixing a 1-D array with an
    (n, 1) column vector broadcasts the product to (n, n) and yields a
    meaningless ratio that can exceed 1.

    Raises:
        ValueError: if the two inputs do not hold the same number of values.
    """
    actual_flat = np.asarray(actual).ravel()
    pred_flat = np.asarray(y_pred).ravel()
    if actual_flat.shape != pred_flat.shape:
        raise ValueError(
            "actual and y_pred must contain the same number of observations, "
            f"got {actual_flat.size} and {pred_flat.size}"
        )
    return np.sum(actual_flat * pred_flat > 0) / len(actual_flat)


# Score every forecast column (each model, 'Combined' and 'HA') against the
# realised premium, with the historical average as the benchmark.
ml_in_performance = []

for col in y_ml_in_pred.columns:
    column_forecast = y_ml_in_pred[[col]].to_numpy()   # (N, 1), same layout as `actual`
    oos_r_square = compute_in_r_square(actual, y_pred_HA, column_forecast)
    success_ratio = compute_success_ratio(actual, column_forecast)
    # both metrics are reported in percent
    ml_in_performance.append([oos_r_square * 100, success_ratio * 100])

ml_in_performance_df = pd.DataFrame(
    np.array(ml_in_performance),
    index=y_ml_in_pred.columns,
    columns=['in_r_square(%)', 'success_ratio(%)'],
)
ml_in_performance_df

Unnamed: 0,in_r_square(%),success_ratio(%)
OLS,5.509803,60.514641
PLS,2.216518,60.070985
PCR,2.046334,59.094942
LASSO,1.970166,60.248447
ENet,1.681969,60.425909
GBRT,10.725015,60.958296
RF,9.767483,60.692103
NN1,1.39371,60.425909
NN2,0.327844,60.248447
NN3,0.613812,60.070985


In [13]:
# Write the in-sample performance table to its own workbook
ml_in_performance_df.to_excel("ml_equity_premium_in_sample_results.xlsx",
                              sheet_name='in_sample_performance')