Github resource from the author: https://github.com/xiubooth/ML_Codes/tree/master/Simu_Matlab

In [1]:
import numpy as np
import pandas as pd
import gc # Use gc.collect() to release memory usage
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('GKX_20201231.csv')

In [3]:
start_date, end_date = 19570131, 20161231

In [4]:
# This is all training data that will be used for recursive training
data = data[(data['DATE'] >= start_date) & (data['DATE'] <= end_date)].reset_index(drop=True)

# Change date format for grouping later; offsets.MonthEnd(0) means dates need not be adjusted. They were adjusted in the dataset
data['DATE'] = pd.to_datetime(data['DATE'], format='%Y%m%d') + pd.offsets.MonthEnd(0)

# Copy the data for top/bottom 1000 AND recursive training later
# data_copy = data.copy()

In [5]:
characteristics = list(set(data.columns).difference({'permno','DATE','SHROUT','mve0','sic2','RET','prc'}))
characteristics.sort()

In [6]:
# BEWARE OF THE DIFFERENCE BETWEEN
# data[ch] = data[ch].groupby(data['DATE']).transform(lambda x: x.fillna(x.median()))
# data[ch] = data.groupby('DATE')[ch].transform(lambda x: x.fillna(x.median()))

for ch in characteristics:
    data[ch] = data.groupby('DATE')[ch].transform(lambda x: x.fillna(x.median()))

for ch in characteristics:
    data[ch] = data[ch].fillna(0)

print(data.columns[data.isnull().sum() != 0])

Index(['prc', 'mve0', 'sic2'], dtype='object')


In [7]:
# Load macroeconomic predictors data. There are eight of them
data_ma = pd.read_excel('PredictorData2022.xlsx')
# The format of dates in macro predictors is YYYYMM instead of YYYYMMDD
data_ma = data_ma[(data_ma['yyyymm'] >= start_date//100) & (data_ma['yyyymm'] <= end_date//100)].reset_index(drop=True)

In [8]:
# Construct predictors
# Index, ntis, tbl, svar are given in the dataset. The remaining four are calculated using other columns
ma_predictors = ['dp_sp','ep_sp','bm_sp','ntis','tbl','tms','dfy','svar']

# data_ma['Index'] is already float64
# data_ma['Index'] = data_ma['Index'].str.replace(',','').astype('float64')
data_ma['dp_sp'] = data_ma['D12'] / data_ma['Index']
data_ma['ep_sp'] = data_ma['E12'] / data_ma['Index']
data_ma.rename({'b/m':'bm_sp'},axis=1,inplace=True)
data_ma['tms'] = data_ma['lty'] - data_ma['tbl']
data_ma['dfy'] = data_ma['BAA'] - data_ma['AAA']

# This removes all the intermediate columns and leaves only the eight predictors, date, and risk-free rate column
data_ma = data_ma[['yyyymm'] + ma_predictors + ['Rfree']]
data_ma['yyyymm'] = pd.to_datetime(data_ma['yyyymm'], format='%Y%m') + pd.offsets.MonthEnd(0)

In [None]:
data_ma

In [None]:
# Picks out the top and bottom 1000 stocks per month by market value
#data_top = data.sort_values('mvel1', ascending=False).groupby('DATE').head(1000).reset_index(drop=True)
#data_top = data_copy.sort_values('mvel1', ascending=False).groupby('DATE').head(1000).reset_index(drop=True)
#data_bot = data_copy.sort_values('mvel1', ascending=False).groupby('DATE').tail(1000).reset_index(drop=True)

In [None]:
# for ch in characteristics:
#     data_top[ch] = data_top.groupby('DATE')[ch].transform(lambda x: x.fillna(x.median()))
#     data_bot[ch] = data_bot.groupby('DATE')[ch].transform(lambda x: x.fillna(x.median()))

# print(data_top.columns[data_top.isnull().sum() != 0])

# for ch in characteristics:
#     data_top[ch] = data_top[ch].fillna(0)
#     data_bot[ch] = data_bot[ch].fillna(0)

# print(data_top.columns[data_top.isnull().sum() != 0])
# print(data_bot.columns[data_bot.isnull().sum() != 0])

In [9]:
# Get dummies for SIC code
def get_sic_dummies(data):
    sic_dummies = pd.get_dummies(data['sic2'].fillna(999).astype(int), prefix='sic').drop('sic_999', axis=1)
    data = pd.concat([data, sic_dummies], axis=1)
    data.drop(['prc', 'SHROUT', 'mve0', 'sic2'], inplace=True, axis=1)
    return data

In [10]:
data = get_sic_dummies(data)
#data_top = get_sic_dummies(data_top)
#data_bot = get_sic_dummies(data_bot)

In [11]:
print(data.shape)
#print(data_top.shape)
#print(data_bot.shape)

(3762139, 171)


In [12]:
# training 1957 - 1974
# validation 1975 - 1986
# test 1987 - 2016
start_val = np.datetime64('1975-01-31')
start_test = np.datetime64('1987-01-31')
end_test = np.datetime64('1987-12-31')

In [None]:
del data_try, data_ma_long

In [None]:
gc.collect()

In [None]:
# Try here first
data_try = data[(data['DATE'] <= np.datetime64('2014-02-28')) & (data['DATE'] >= np.datetime64('2014-01-31'))]
data_ma_long = pd.merge(data_try[['DATE']], data_ma, left_on='DATE', right_on='yyyymm', how='left').reset_index(drop=True)
data_try = data_try.reset_index(drop=True)

In [None]:
data_try

In [None]:
data_try.loc[:, 'RET'] = data_try.loc[:, 'RET'] - data_ma_long.loc[:, 'Rfree']

In [None]:
data_try.to_csv('raw_data.csv')

In [None]:
for fc in characteristics:
    data_try[fc] = data_try.groupby('DATE')[fc].rank()
    data_try[fc] = data_try.groupby('DATE')[fc].transform(lambda x: ((2 * (x - x.min())) / (x.max() - x.min())) - 1
                                                     if x.max() - x.min() != 0 else 0)

In [None]:
interactions = []
for fc in characteristics:
    for mp in ma_predictors:
        data_try[fc + '*' + mp] = data_try.loc[:, fc] * data_ma_long.loc[:, mp]
        interactions.append(fc + '*' + mp)

In [None]:
features = list(set(data_try.columns).difference({'permno', 'DATE', 'RET'}))

In [None]:
for item in interactions:
    data_try[item] = data_try.groupby('DATE')[item].transform(lambda x: (x - x.mean()) / x.std() )

In [None]:
for item in interactions:
    data_try[item] = data_try.groupby('DATE')[item].transform(lambda x: ((2 * (x - x.min())) / (x.max() - x.min())) - 1
                                                     if x.max() - x.min() != 0 else 0)

In [None]:
data_try

In [None]:
data_try.to_csv('transformed_data_scaled.csv')

In [None]:
x_t_sorted = data_try.sort_values(by='mvel1', ascending=False).groupby('DATE').head(1000).reset_index(drop=True)
x_b_sorted = data_try.sort_values(by='mvel1').groupby('DATE').head(1000).reset_index(drop=True)

In [None]:
x_t_sorted

In [None]:
x_b_sorted

In [None]:
x_t_sorted.to_csv('top1k.csv')
x_b_sorted.to_csv('bot1k.csv')

In [None]:
# Rescales numbers to be in [-1, 1]
# def rescale_group(group):
#     if group.max() - group.min() != 0:
#         return 2 * (group - group.min()) / (group.max() - group.min()) - 1
#     else:
#         return 0

In [13]:
def interactions(data, data_ma, characteristics, ma_predictors):

    data_ma_long = pd.merge(data[['DATE']], data_ma, left_on='DATE', right_on='yyyymm', how='left').reset_index(drop=True)
    
    # MUST HAVE THIS LINE OR ELSE CAN'T MULTIPLY!!!
    data = data.reset_index(drop=True)
    
    # Adjust RET to excess return
    data.loc[:, 'RET'] = data.loc[:, 'RET'] - data_ma_long.loc[:, 'Rfree']
    
    # Data PRE-processing: cross-sectional rank transformation
    # data has all columns (including DATE) and all rows
    # See page 23 of https://www.nber.org/system/files/working_papers/w24540/w24540.pdf (another Kelly et al paper)
    for fc in characteristics:
        data[fc] = data.groupby('DATE')[fc].rank()
        data[fc] = data.groupby('DATE')[fc].transform(lambda x: ((2 * (x - x.min())) / (x.max() - x.min())) - 1
                                                     if x.max() - x.min() != 0 else 0)
    
    interactions = []
    for fc in characteristics:
        for mp in ma_predictors:
        # reference for making this less fragmented
        # https://stackoverflow.com/questions/68292862/
        # performancewarning-dataframe-is-highly-fragmented-this-is-usually-the-result-o
            #col_to_add = pd.DataFrame(data.loc[:, fc] * data_ma_long.loc[:, mp], columns=[fc + '*' + mp])
            #data = pd.concat([data, col_to_add], axis=1)
            
            # Maybe faster is
            data[fc + '*' + mp] = data.loc[:, fc] * data_ma_long.loc[:, mp]
            interactions.append(fc + '*' + mp)
    
    # Also scale the interactions so there are no super small or large numbers
    for item in interactions:
        #data[item] = data.groupby('DATE')[item].transform(lambda x: (x - x.mean()) / x.std() if x.max() - x.min() != 0 else 0)
        data[item] = data.groupby('DATE')[item].transform(lambda x: ((2 * (x - x.min())) / (x.max() - x.min())) - 1
                                                     if x.max() - x.min() != 0 else 0)
        
    # 94 (chars) * 8 (macro) + 94 (chars) + 74 (industry) = 920. This is in (fixed) random order
    features = list(set(data.columns).difference({'permno', 'DATE', 'RET'}))
    
#     data[features] = MinMaxScaler((-1,1)).fit_transform(data[features])
#     data[features] = pd.DataFrame(data, columns=features)
    
    # No idea why the following does not work
    #data[characteristics] = data.groupby('DATE')[characteristics].rank()
    #data[characteristics] = data.groupby('DATE')[characteristics].transform(rescale_group)
    
    # Get x and y
    x = data[features]
    y = pd.DataFrame(data['RET'], columns=['RET'])
    
    # Get top 1k
    x_t_sorted = data.sort_values(by='mvel1', ascending=False).groupby('DATE').head(1000).reset_index(drop=True)
    x_t = x_t_sorted[features]
    y_t = pd.DataFrame(x_t_sorted['RET'], columns=['RET'])
    
    # Get bot 1k; without ascending=False, ordered in ascending order
    x_b_sorted = data.sort_values(by='mvel1').groupby('DATE').head(1000).reset_index(drop=True)
    x_b = x_b_sorted[features]
    y_b = pd.DataFrame(x_b_sorted['RET'], columns=['RET'])
    
    print(x.shape, y.shape, x_t.shape, y_t.shape, x_b.shape, y_b.shape)
    return x, y, x_t, y_t, x_b, y_b

In [None]:
gc.collect()

In [14]:
x_train, y_train, x_train_t, y_train_t, x_train_b, y_train_b = interactions(data[data['DATE'] < start_val], 
                                                                            data_ma[data_ma['yyyymm'] < start_val],
                                                                            characteristics, ma_predictors)

x_val, y_val, x_val_t, y_val_t, x_val_b, y_val_b = interactions(data[(data['DATE'] < start_test) & (data['DATE'] >= start_val)],
                                                                data_ma[(data_ma['yyyymm'] < start_test) & (data_ma['yyyymm'] >= start_val)],
                                                                characteristics, ma_predictors)

x_test, y_test, x_test_t, y_test_t, x_test_b, y_test_b = interactions(data[(data['DATE'] >= start_test) & (data['DATE'] <= end_test)],
                                                                      data_ma[(data_ma['yyyymm'] >= start_test) & (data_ma['yyyymm'] <= end_test)],
                                                                      characteristics, ma_predictors)
gc.collect()
    

(479467, 920) (479467, 1) (216000, 920) (216000, 1) (216000, 920) (216000, 1)
(773887, 920) (773887, 1) (144000, 920) (144000, 1) (144000, 920) (144000, 1)
(83323, 920) (83323, 1) (12000, 920) (12000, 1) (12000, 920) (12000, 1)


0

In [15]:
# Check whether there are invalid data points before proceeding to train
print(x_train.columns[x_train.isnull().sum() != 0])
print(x_val.columns[x_val.isnull().sum() != 0])
print(x_test.columns[x_test.isnull().sum() != 0])

Index([], dtype='object')
Index([], dtype='object')
Index([], dtype='object')


In [None]:
# Check whether firm characteristics are all scaled between -1 and 1
# print(np.alltrue(abs(x_train[features])<=1))
# print(np.alltrue(abs(x_val[features])<=1))
# print(np.alltrue(abs(x_test[features])<=1))

In [None]:
x_train.head()

In [None]:
x_val.head()

In [None]:
x_test.head()

In [None]:
# First run gets train, val, test for the first iteration
#x_train, x_val, x_test, y_train, y_val, y_test = train_val_test_split(data)
#x_train_t, x_val_t, x_test_t, y_train_t, y_val_t, y_test_t = train_val_test_split(data_top)
#x_train_b, x_val_b, x_test_b, y_train_b, y_val_b, y_test_b = train_val_test_split(data_bot)

In [None]:
# print(x_train.shape, y_train.shape)
# print(x_val.shape, y_val.shape)
# print(x_test.shape, y_test.shape)
# print(x_train_t.shape, y_train_t.shape)
# print(x_val_t.shape, y_val_t.shape)
# print(x_test_t.shape, y_test_t.shape)
# print(x_train_b.shape, y_train_b.shape)
# print(x_val_b.shape, y_val_b.shape)
# print(x_test_b.shape, y_test_b.shape)

# RF

In [None]:
from sklearn.ensemble import RandomForestRegressor

# rf_regressor = RandomForestRegressor(n_estimators=300, max_depth=6, max_features=100).fit(x_train, y_train)

In [None]:
# print(f'The total training set has MSE {mean_squared_error(y_train, rf_regressor.predict(x_train))}')
# print(f'The total validation set has MSE {mean_squared_error(y_val, rf_regressor.predict(x_val))}')
# print(f'The total test set has MSE {mean_squared_error(y_test, rf_regressor.predict(x_test))}')
# print(f'The total training set has (demeaned) r^2 {r2_score(y_train, rf_regressor.predict(x_train))}')
# print(f'The total training set has r^2 {R_oos(y_train, rf_regressor.predict(x_train))}')
# print(f'The total validation set has r^2 {R_oos(y_val, rf_regressor.predict(x_val))}')
# print(f'The total test set has r^2 {R_oos(y_test, rf_regressor.predict(x_test))}\n')

print(f'The top 1k training set has MSE {mean_squared_error(y_train_t, rf_regressor_t.predict(x_train_t))}')
print(f'The top 1k validation set has MSE {mean_squared_error(y_val_t, rf_regressor_t.predict(x_val_t))}')
print(f'The top 1k test set has MSE {mean_squared_error(y_test_t, rf_regressor_t.predict(x_test_t))}')
print(f'The top 1k training set has (demeaned) r^2 {r2_score(y_train_t, rf_regressor_t.predict(x_train_t))}')
print(f'The top 1k training set has r^2 {R_oos(y_train_t, rf_regressor_t.predict(x_train_t))}')
print(f'The top 1k validation set has r^2 {R_oos(y_val_t, rf_regressor_t.predict(x_val_t))}')
print(f'The top 1k test set has r^2 {R_oos(y_test_t, rf_regressor_t.predict(x_test_t))}\n')

print(f'The bottom 1k training set has MSE {mean_squared_error(y_train_b, rf_regressor_b.predict(x_train_b))}')
print(f'The bottom 1k validation set has MSE {mean_squared_error(y_val_b, rf_regressor_b.predict(x_val_b))}')
print(f'The bottom 1k test set has MSE {mean_squared_error(y_test_b, rf_regressor_b.predict(x_test_b))}')
print(f'The bottom 1k training set has (demeaned) r^2 {r2_score(y_train_b, rf_regressor_b.predict(x_train_b))}')
print(f'The bottom 1k training set has r^2 {R_oos(y_train_b, rf_regressor_b.predict(x_train_b))}')
print(f'The bottom 1k validation set has r^2 {R_oos(y_val_b, rf_regressor_b.predict(x_val_b))}')
print(f'The bottom 1k test set has r^2 {R_oos(y_test_b, rf_regressor_b.predict(x_test_b))}\n')

# Apply NN and OLS_3 Once

In [16]:
# R^2 for OLS_3
def R_oos(y_true, y_pred):
    y_true, y_pred = np.array(y_true).flatten(), np.array(y_pred).flatten()
    return 1 - (np.inner((y_true - y_pred), (y_true - y_pred))) / (np.inner(y_true, y_true))

In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error#, r2_score

# OLS with preselected size, bm, and momentum covariates
features_3 = ['mvel1','bm','mom1m']
OLS_3 = LinearRegression().fit(x_train[features_3], y_train)
#OLS_3_t = LinearRegression().fit(x_train_t[features_3], y_train_t)
#OLS_3_b = LinearRegression().fit(x_train_b[features_3], y_train_b)

In [18]:
# Initialize to record all OLS_3 results
OLS_3_train_mse = []
OLS_3_val_mse = []
OLS_3_test_mse = []
# OLS_3_train_R2_demeaned = []
OLS_3_train_R2 = []
OLS_3_val_R2 = []
OLS_3_test_R2 = []

#OLS_3_train_t_mse = []
#OLS_3_val_t_mse = []
OLS_3_test_t_mse = []
# OLS_3_train_t_R2_demeaned = []
# OLS_3_train_t_R2 = []
#OLS_3_val_t_R2 = []
OLS_3_test_t_R2 = []

#OLS_3_train_b_mse = []
#OLS_3_val_b_mse = []
OLS_3_test_b_mse = []
# OLS_3_train_t_R2_demeaned = []
#OLS_3_train_b_R2 = []
#OLS_3_val_b_R2 = []
OLS_3_test_b_R2 = []

In [19]:
OLS_3_train_mse.append(mean_squared_error(y_train, OLS_3.predict(x_train[features_3])))
OLS_3_val_mse.append(mean_squared_error(y_val, OLS_3.predict(x_val[features_3])))
OLS_3_test_mse.append(mean_squared_error(y_test, OLS_3.predict(x_test[features_3])))
# OLS_3_train_R2_demeaned.append(r2_score(y_train, OLS_3.predict(x_train[features_3])))
OLS_3_train_R2.append(R_oos(y_train, OLS_3.predict(x_train[features_3])))
OLS_3_val_R2.append(R_oos(y_val, OLS_3.predict(x_val[features_3])))
OLS_3_test_R2.append(R_oos(y_test, OLS_3.predict(x_test[features_3])))

#OLS_3_train_t_mse.append(mean_squared_error(y_train_t, OLS_3.predict(x_train_t[features_3])))
#OLS_3_val_t_mse.append(mean_squared_error(y_val_t, OLS_3.predict(x_val_t[features_3])))
OLS_3_test_t_mse.append(mean_squared_error(y_test_t, OLS_3.predict(x_test_t[features_3])))
# OLS_3_train_t_R2_demeaned.append(r2_score(y_train_t, OLS_3.predict(x_train_t[features_3])))
#OLS_3_train_t_R2.append(R_oos(y_train_t, OLS_3.predict(x_train_t[features_3])))
#OLS_3_val_t_R2.append(R_oos(y_val_t, OLS_3.predict(x_val_t[features_3])))
OLS_3_test_t_R2.append(R_oos(y_test_t, OLS_3.predict(x_test_t[features_3])))

#OLS_3_val_b_mse.append(mean_squared_error(y_val_b, OLS_3.predict(x_val_b[features_3])))
OLS_3_test_b_mse.append(mean_squared_error(y_test_b, OLS_3.predict(x_test_b[features_3])))
# OLS_3_train_t_R2_demeaned.append(r2_score(y_train_t, OLS_3.predict(x_train_t[features_3])))
#OLS_3_train_t_R2.append(R_oos(y_train_t, OLS_3.predict(x_train_t[features_3])))
#OLS_3_val_b_R2.append(R_oos(y_val_b, OLS_3.predict(x_val_b[features_3])))
OLS_3_test_b_R2.append(R_oos(y_test_b, OLS_3.predict(x_test_b[features_3])))

# OLS_3_train_t_mse.append(mean_squared_error(y_train_t, OLS_3_t.predict(x_train_t[features_3])))
# OLS_3_val_t_mse.append(mean_squared_error(y_val_t, OLS_3_t.predict(x_val_t[features_3])))
# OLS_3_test_t_mse.append(mean_squared_error(y_test_t, OLS_3_t.predict(x_test_t[features_3])))
# OLS_3_train_t_R2_demeaned.append(r2_score(y_train_t, OLS_3_t.predict(x_train_t[features_3])))
# OLS_3_train_t_R2.append(R_oos(y_train_t, OLS_3_t.predict(x_train_t[features_3])))
# OLS_3_val_t_R2.append(R_oos(y_val_t, OLS_3_t.predict(x_val_t[features_3])))
# OLS_3_test_t_R2.append(R_oos(y_test_t, OLS_3_t.predict(x_test_t[features_3])))

print(f'The total training set has MSE {mean_squared_error(y_train, OLS_3.predict(x_train[features_3]))}')
print(f'The total validation set has MSE {mean_squared_error(y_val, OLS_3.predict(x_val[features_3]))}')
print(f'The total test set has MSE {mean_squared_error(y_test, OLS_3.predict(x_test[features_3]))}')
# print(f'The total training set has (demeaned) r^2 {r2_score(y_train, OLS_3.predict(x_train[features_3]))}')
print(f'The total training set has r^2 {R_oos(y_train, OLS_3.predict(x_train[features_3]))}')
print(f'The total validation set has r^2 {R_oos(y_val, OLS_3.predict(x_val[features_3]))}')
print(f'The total test set has r^2 {R_oos(y_test, OLS_3.predict(x_test[features_3]))}\n')

#print(f'The top 1k training set has MSE {mean_squared_error(y_train_t, OLS_3.predict(x_train_t[features_3]))}')
#print(f'The top 1k validation set has MSE {mean_squared_error(y_val_t, OLS_3.predict(x_val_t[features_3]))}')
print(f'The top 1k test set has MSE {mean_squared_error(y_test_t, OLS_3.predict(x_test_t[features_3]))}')
#print(f'The top 1k training set has (demeaned) r^2 {r2_score(y_train_t, OLS_3.predict(x_train_t[features_3]))}')
#print(f'The top 1k training set has r^2 {R_oos(y_train_t, OLS_3.predict(x_train_t[features_3]))}')
#print(f'The top 1k validation set has r^2 {R_oos(y_val_t, OLS_3.predict(x_val_t[features_3]))}')
print(f'The top 1k test set has r^2 {R_oos(y_test_t, OLS_3.predict(x_test_t[features_3]))}\n')

# print(f'The top 1k training set has MSE {mean_squared_error(y_train_t, OLS_3_t.predict(x_train_t[features_3]))}')
# print(f'The top 1k validation set has MSE {mean_squared_error(y_val_t, OLS_3_t.predict(x_val_t[features_3]))}')
# print(f'The top 1k test set has MSE {mean_squared_error(y_test_t, OLS_3_t.predict(x_test_t[features_3]))}')
# print(f'The top 1k training set has (demeaned) r^2 {r2_score(y_train_t, OLS_3_t.predict(x_train_t[features_3]))}')
# print(f'The top 1k training set has r^2 {R_oos(y_train_t, OLS_3_t.predict(x_train_t[features_3]))}')
# print(f'The top 1k validation set has r^2 {R_oos(y_val_t, OLS_3_t.predict(x_val_t[features_3]))}')
# print(f'The top 1k test set has r^2 {R_oos(y_test_t, OLS_3_t.predict(x_test_t[features_3]))}\n')

#print(f'The bottom 1k training set has MSE {mean_squared_error(y_train_b, OLS_3.predict(x_train_b[features_3]))}')
#print(f'The bottom 1k validation set has MSE {mean_squared_error(y_val_b, OLS_3.predict(x_val_b[features_3]))}')
print(f'The bottom 1k test set has MSE {mean_squared_error(y_test_b, OLS_3.predict(x_test_b[features_3]))}')
#print(f'The bottom 1k training set has (demeaned) r^2 {r2_score(y_train_b, OLS_3.predict(x_train_b[features_3]))}')
#print(f'The bottom 1k training set has r^2 {R_oos(y_train_b, OLS_3.predict(x_train_b[features_3]))}')
#print(f'The bottom 1k validation set has r^2 {R_oos(y_val_b, OLS_3.predict(x_val_b[features_3]))}')
print(f'The bottom 1k test set has r^2 {R_oos(y_test_b, OLS_3.predict(x_test_b[features_3]))}')

The total training set has MSE 0.015894677700940815
The total validation set has MSE 0.02657529108184017
The total test set has MSE 0.0362750727239179
The total training set has r^2 0.003989748312870911
The total validation set has r^2 -0.0018476187022247181
The total test set has r^2 0.0020615522965786726

The top 1k test set has MSE 0.015601747395857894
The top 1k test set has r^2 -0.0020383520128559596

The bottom 1k test set has MSE 0.08297433207552728
The bottom 1k test set has r^2 0.0011847044500689075


In [21]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import regularizers
# Import the class NN from NN_implementations
from ipynb.fs.defs.NN_implementations import NN

In [22]:
model = NN()

In [23]:
# R^2 for NN
def R_squared(y_true, y_pred):
    resid = tf.square(y_true - y_pred)
    denom = tf.square(y_true)
    return 1 - tf.divide(tf.reduce_sum(resid), tf.reduce_sum(denom))

In [24]:
# Record annual results (30, 10) = (year, model)
loss_train = np.zeros((30, 1))
loss_val = np.zeros((30, 1))
loss_test = np.zeros((30, 1))
loss_test_t = np.zeros((30, 1))
loss_test_b = np.zeros((30, 1))
# reg_loss_list = np.zeros((30, 10))
# total_loss_list = np.zeros((30, 10))
R2_train = np.zeros((30, 1))
R2_val = np.zeros((30, 1))
R2_test = np.zeros((30, 1))
R2_test_t = np.zeros((30, 1))
R2_test_b = np.zeros((30, 1))

# Dictionary keys are 'year_model' (e.g. '1975_0' means training till 1975 and validation starts on 1975)
y_train_pred_dict = {}
y_val_pred_dict = {}
# Dictionary keys are 'year_model' (e.g. '1987_0' means the first model testing 1987)
y_pred_dict = {}
y_pred_t_dict = {}
y_pred_b_dict = {}

# Record monthly results (30, 12, 10) = (year, month, model)
loss_testM = np.zeros((30, 12))
loss_test_tM = np.zeros((30, 12))
loss_test_bM = np.zeros((30, 12))
R2_testM = np.zeros((30, 12))
R2_test_tM = np.zeros((30, 12))
R2_test_bM = np.zeros((30, 12))

# Dictionary keys are 'year_month_model' (e.g. '1987_0_3' means the fourth model testing Jan 1987)
y_predM_dict = {}
y_pred_tM_dict = {}
y_pred_bM_dict = {}

# Specify the hyperparameters
L1_val = 0
L2_val = 0.01
dropout = 0
lr_val = 1e-3
bs_val = 256

In [None]:
# Define and compile 10 models for ensemble later
model_dict = {}
for i in range(10):
    seed_val = 120 + i
    model_dict[str(i)] = model.call(
                                model_input = keras.layers.Input(shape=(920, )),
                                n_layers = 3,
                                layers_dim = [128, 32, 8],
                                activation = 'tanh',
                                BatchNormalization = True,
                                L1_lambda = L1_val,
                                L2_lambda = L2_val,
                                dropout_rate = dropout,
                                seed = seed_val)
    model_dict[str(i)].compile(
        loss = keras.losses.MeanSquaredError(),
        optimizer=keras.optimizers.Adam(lr_val),
        metrics=[R_squared]
        )

In [25]:
mse = tf.keras.losses.MeanSquaredError()

In [None]:
# Load saved weights
#for i in range(10):
#    model_dict[str(i)].load_weights('model_NN3_weights_' + str(i) + '.h5')

In [None]:
# model_dict['9'].evaluate(x_test_t, y_test_t)

In [26]:
gc.collect()
keras.backend.clear_session()

In [None]:
# start_val = np.datetime64('1975-01-31')
# start_test = np.datetime64('1987-01-31')
# end_test = np.datetime64('1987-12-31')

In [27]:
# Try it here first
all_months = np.arange('1987-01', '1988-02', dtype='datetime64[M]').astype('datetime64[D]')

In [28]:
all_months

array(['1987-01-01', '1987-02-01', '1987-03-01', '1987-04-01',
       '1987-05-01', '1987-06-01', '1987-07-01', '1987-08-01',
       '1987-09-01', '1987-10-01', '1987-11-01', '1987-12-01',
       '1988-01-01'], dtype='datetime64[D]')

In [29]:
i=0
start_val_year = 1975
start_test_year = 1987

In [62]:
model = NN()

In [63]:
model = model.call(
                                model_input = keras.layers.Input(shape=(920, )),
                                n_layers = 3,
                                layers_dim = [128, 32, 8],
                                activation = 'tanh',
                                BatchNormalization = True,
                                L1_lambda = 0,
                                L2_lambda = 0.001,
                                dropout_rate = 0,
                                seed = 129)
model.compile(
        loss = keras.losses.MeanSquaredError(),
        optimizer=keras.optimizers.Adam(1e-3),
        metrics=[R_squared]
        )

In [64]:
earlystop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=1)

In [65]:
history = model.fit(x_train, y_train, validation_data = (x_val, y_val),
                               # change batch_size and epoch
                               batch_size=256, epochs=100
                               # optional early stop
                               ,callbacks=[earlystop]
                               )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 21: early stopping


In [None]:
model_dict[str(i)].save_weights(f'Resluts_log\\seed_129_weights_{start_test_year}_' + str(i) + '.h5')

In [66]:
gc.collect()
keras.backend.clear_session()

In [67]:
y_train_pred_dict[f'{start_val_year}_' + str(i)] = model.predict(x_train, batch_size=x_train.shape[0])
y_val_pred_dict[f'{start_val_year}_' + str(i)] = model.predict(x_val, batch_size=x_val.shape[0])
y_pred_dict[f'{start_test_year}_' + str(i)] = model.predict(x_test, batch_size=x_test.shape[0])
y_pred_t_dict[f'{start_test_year}_' + str(i)] = model.predict(x_test_t, batch_size=x_test_t.shape[0])
y_pred_b_dict[f'{start_test_year}_' + str(i)] = model.predict(x_test_b, batch_size=x_test_b.shape[0])



In [68]:
y_pred_dict[f'{start_test_year}_' + str(i)]

array([[ 0.00477573],
       [ 0.01035129],
       [ 0.01071178],
       ...,
       [ 0.01140213],
       [ 0.00664993],
       [-0.05124054]], dtype=float32)

In [69]:
np.mean(y_pred_dict[f'{start_test_year}_' + str(i)])

-0.0056069465

In [70]:
np.std(y_pred_dict[f'{start_test_year}_' + str(i)])

0.021711268

In [71]:
gc.collect()

757

In [72]:
loss_test[0, i] = mse(y_test, y_pred_dict[f'{start_test_year}_' + str(i)])
R2_test[0, i] = R_squared(y_test, y_pred_dict[f'{start_test_year}_' + str(i)])
loss_test_t[0, i] = mse(y_test_t, y_pred_t_dict[f'{start_test_year}_' + str(i)])
R2_test_t[0, i] = R_squared(y_test_t, y_pred_t_dict[f'{start_test_year}_' + str(i)])
loss_test_b[0, i] = mse(y_test_b, y_pred_b_dict[f'{start_test_year}_' + str(i)])
R2_test_b[0, i] = R_squared(y_test_b, y_pred_b_dict[f'{start_test_year}_' + str(i)]) 

In [73]:
print(loss_test[0, i], loss_test_t[0, i], loss_test_b[0, i], R2_test[0, i], R2_test_t[0, i], R2_test_b[0, i])

0.03622542321681976 0.015725161880254745 0.08329074084758759 0.0034274057159334737 -0.009964680035867923 -0.0026240460291382384


In [74]:
loss_train[0, i] = mse(y_train, y_train_pred_dict[f'{start_val_year}_' + str(i)])
R2_train[0, i] = R_squared(y_train, y_train_pred_dict[f'{start_val_year}_' + str(i)])
loss_val[0, i] = mse(y_val, y_val_pred_dict[f'{start_val_year}_' + str(i)])
R2_val[0, i] = R_squared(y_val, y_val_pred_dict[f'{start_val_year}_' + str(i)])

In [75]:
print(loss_train[0, i], loss_val[0, i], R2_train[0, i], R2_val[0, i])

0.015825476497411728 0.027056017890572548 0.008325995773784123 -0.019970240208824652


In [47]:
j=0

In [51]:
# Get entire, top, and bottom test sets
start_testM = all_months[j]
end_testM = all_months[j+1]
x_testM, y_testM, x_test_tM, y_test_tM, x_test_bM, y_test_bM = interactions(
                                                data[(data['DATE'] >= start_testM) & (data['DATE'] <= end_testM)],
                                                data_ma[(data_ma['yyyymm'] >= start_testM) & (data_ma['yyyymm'] <= end_testM)],
                                                characteristics, ma_predictors)
        
# Find model predictions for entire/top/bottom WITHOUT batch_size
y_predM_dict[f'{start_test_year}_' + str(j) + '_' + str(i)] = model.predict(x_testM, batch_size=x_testM.shape[0])
y_pred_tM_dict[f'{start_test_year}_' + str(j) + '_' + str(i)] = model.predict(x_test_tM, batch_size=x_test_tM.shape[0])
y_pred_bM_dict[f'{start_test_year}_' + str(j) + '_' + str(i)] = model.predict(x_test_bM, batch_size=x_test_bM.shape[0])

(6642, 920) (6642, 1) (1000, 920) (1000, 1) (1000, 920) (1000, 1)


In [None]:
# Rename for easier reference later; RET is excess return
y_predM = pd.DataFrame(y_predM_dict[f'{start_test_year}_' + str(j) + '_' + str(i)], columns=['Pred'])
y_pred_tM = pd.DataFrame(y_pred_tM_dict[f'{start_test_year}_' + str(j) + '_' + str(i)], columns=['Pred'])
y_pred_bM = pd.DataFrame(y_pred_bM_dict[f'{start_test_year}_' + str(j) + '_' + str(i)], columns=['Pred'])
        
# Record loss and R^2
loss_testM[0, j, i] = mse(y_testM, y_predM)
R2_testM[0, j, i] = R_oos(y_testM, y_predM)
loss_test_tM[0, j, i] = mse(y_test_tM, y_pred_tM)
R2_test_tM[0, j, i] = R_oos(y_test_tM, y_pred_tM)
loss_test_bM[0, j, i] = mse(y_test_bM, y_pred_bM)
R2_test_bM[0, j, i] = R_oos(y_test_bM, y_pred_bM) 

In [None]:
print(loss_testM[0, j, i], loss_test_tM[0, j, i], loss_test_bM[0, j, i])
print(R2_testM[0, j, i], R2_test_tM[0, j, i], R2_test_bM[0, j, i])

In [125]:
std_numpy = np.zeros((10, 1))
for i in range(10):
    std_numpy = std_numpy + np.array(std_dict[str(i)]).reshape((10, 1))

std_numpy = std_numpy / 12
std_numpy = np.sqrt(std_numpy)

In [126]:
pred_numpy = np.zeros((10, 1))
for i in range(10):
    pred_numpy = pred_numpy + np.array(pred_dict[str(i)]).reshape((10, 1))

pred_numpy = pred_numpy / 12

In [127]:
avg_numpy = np.zeros((10, 1))
for i in range(10):
    avg_numpy = avg_numpy + np.array(avg_dict[str(i)]).reshape((10, 1))

avg_numpy = avg_numpy / 12

In [128]:
SR_numpy = avg_numpy / std_numpy

In [116]:
SR_numpy.reshape((10, ))

array([-0.02925656, -0.01808599, -0.00764438, -0.00105765, -0.00994002,
       -0.01524728, -0.02040882, -0.01044319, -0.00666282,  0.00113671])

In [138]:
performance = pd.DataFrame(np.hstack([pred_numpy, avg_numpy, std_numpy, SR_numpy]),
                          columns=['Pred', 'Avg', 'Std', 'SR'])

In [139]:
performance

Unnamed: 0,Pred,Avg,Std,SR
0,-0.04364,-0.011401,0.389702,-0.029257
1,-0.021491,-0.007229,0.399688,-0.018086
2,-0.012473,-0.002955,0.386509,-0.007644
3,-0.006684,-0.000404,0.381998,-0.001058
4,-0.002248,-0.003614,0.363599,-0.00994
5,0.001331,-0.005567,0.365085,-0.015247
6,0.004528,-0.006987,0.342347,-0.020409
7,0.007832,-0.003582,0.342961,-0.010443
8,0.011851,-0.002267,0.340215,-0.006663
9,0.017688,0.000342,0.300798,0.001137


In [141]:
performance.to_csv(f'Results_log\\{start_test_year}_perf.csv')

In [92]:
    pred_dict = {}
    std_dict = {}
    avg_dict = {}
    for j in range(12): # j is month
        # Get entire, top, and bottom test sets
        start_testM = all_months[j]
        end_testM = all_months[j+1]
        x_testM, y_testM, x_test_tM, y_test_tM, x_test_bM, y_test_bM = interactions(
                                                data[(data['DATE'] >= start_testM) & (data['DATE'] <= end_testM)],
                                                data_ma[(data_ma['yyyymm'] >= start_testM) & (data_ma['yyyymm'] <= end_testM)],
                                                characteristics, ma_predictors)
        
        # Find model predictions for entire/top/bottom WITHOUT batch_size
        y_predM_dict[f'{start_test_year}_' + str(j) + '_' + str(i)] = model.predict(x_testM, batch_size=x_testM.shape[0])
        y_pred_tM_dict[f'{start_test_year}_' + str(j) + '_' + str(i)] = model.predict(x_test_tM, batch_size=x_test_tM.shape[0])
        y_pred_bM_dict[f'{start_test_year}_' + str(j) + '_' + str(i)] = model.predict(x_test_bM, batch_size=x_test_bM.shape[0])
        
        # Rename for easier reference later; RET is excess return
        y_predM = pd.DataFrame(y_predM_dict[f'{start_test_year}_' + str(j) + '_' + str(i)], columns=['Pred'])
        y_pred_tM = pd.DataFrame(y_pred_tM_dict[f'{start_test_year}_' + str(j) + '_' + str(i)], columns=['Pred'])
        y_pred_bM = pd.DataFrame(y_pred_bM_dict[f'{start_test_year}_' + str(j) + '_' + str(i)], columns=['Pred'])
        
        # Record loss and R^2
        loss_testM[0, j] = mse(y_testM, y_predM)
        R2_testM[0, j] = R_oos(y_testM, y_predM)
        loss_test_tM[0, j] = mse(y_test_tM, y_pred_tM)
        R2_test_tM[0, j] = R_oos(y_test_tM, y_pred_tM)
        loss_test_bM[0, j] = mse(y_test_bM, y_pred_bM)
        R2_test_bM[0, j] = R_oos(y_test_bM, y_pred_bM)  
    
        pred, avg, std, Sharpe = make_decile(x_testM, y_testM, y_predM)
        pred_dict[str(j)] = pred
        std_dict[str(j)] = std
        avg_dict[str(j)] = avg
        performace = pd.DataFrame(data={'Pred': pred, 'Avg': avg, 'Std': std, 'SR': Sharpe})
        performace.to_csv(f'Results_log\\{start_test_year}_perf_{j}.csv')

(6642, 920) (6642, 1) (1000, 920) (1000, 1) (1000, 920) (1000, 1)
(6673, 920) (6673, 1) (1000, 920) (1000, 1) (1000, 920) (1000, 1)
(6740, 920) (6740, 1) (1000, 920) (1000, 1) (1000, 920) (1000, 1)
(6792, 920) (6792, 1) (1000, 920) (1000, 1) (1000, 920) (1000, 1)
(6845, 920) (6845, 1) (1000, 920) (1000, 1) (1000, 920) (1000, 1)
(6919, 920) (6919, 1) (1000, 920) (1000, 1) (1000, 920) (1000, 1)
(6996, 920) (6996, 1) (1000, 920) (1000, 1) (1000, 920) (1000, 1)
(7081, 920) (7081, 1) (1000, 920) (1000, 1) (1000, 920) (1000, 1)
(7130, 920) (7130, 1) (1000, 920) (1000, 1) (1000, 920) (1000, 1)
(7169, 920) (7169, 1) (1000, 920) (1000, 1) (1000, 920) (1000, 1)
(7172, 920) (7172, 1) (1000, 920) (1000, 1) (1000, 920) (1000, 1)
(7164, 920) (7164, 1) (1000, 920) (1000, 1) (1000, 920) (1000, 1)


In [78]:
print(loss_train[0, 0], loss_val[0, 0], loss_test[0, 0], loss_test_t[0, 0], loss_test_b[0, 0])
print(R2_train[0, 0], R2_val[0, 0], R2_test[0, 0], R2_test_t[0, 0], R2_test_b[0, 0])
print(loss_testM[0, :], loss_test_tM[0, :], loss_test_bM[0, :])
print(R2_testM[0, :], R2_test_tM[0, :], R2_test_bM[0, :])

0.015825476497411728 0.027056017890572548 0.03622542321681976 0.015725161880254745 0.08329074084758759
0.008325995773784123 -0.019970240208824652 0.0034274057159334737 -0.009964680035867923 -0.0026240460291382384
[0.05240846 0.03847449 0.03602919 0.02306702 0.02297267 0.02324396
 0.02888217 0.02019778 0.01855616 0.09503614 0.02969969 0.04526619] [0.02269138 0.00958815 0.00834866 0.00797084 0.00595919 0.00710518
 0.01040277 0.00598117 0.00728697 0.07478819 0.01262027 0.01595914] [0.14395161 0.09555566 0.10155001 0.06237697 0.06607016 0.06370836
 0.07795125 0.04889692 0.04341639 0.09686999 0.06554553 0.133596  ]
[-0.0089486  -0.01770261 -0.01146384 -0.00939329 -0.02125028 -0.02820284
 -0.03309397 -0.01553169  0.00114102  0.04251524  0.02908373  0.00522635] [ 0.0688193   0.04831142  0.00735601 -0.08368115 -0.09346453 -0.00271822
 -0.06312208 -0.02203449 -0.06090096 -0.02627831 -0.03258704  0.03242939] [-0.01938341 -0.01751421 -0.01074385 -0.01530814 -0.0109311  -0.02478008
 -0.01619507 -0

In [76]:
def make_decile(x_testM, y_testM, y_predM):
    x_testM = pd.concat([x_testM, y_testM, y_predM], axis=1)
    x_testM_grouped = x_testM.groupby(pd.qcut(x_testM.Pred, 10, labels=False))
            
    # https://stackoverflow.com/questions/58040767/group-pandas-dataframe-by-quantile-of-single-column/58041129#58041129
    # decile['0'] is lowest decile, decile['9'] highest
    decile = {}
    for key, group in x_testM_grouped:
        decile[str(key)] = pd.DataFrame(group).sort_values(by=['Pred'], ascending=False, ignore_index=True)
            
    pred = []
    std = []
    avg = []
    Sharpe = []
    for i in range(10): # 10 deciles
        pred.append(np.mean(decile[str(i)]['Pred']))
        avg.append(np.mean(decile[str(i)]['RET']))
        std.append(np.std(decile[str(i)]['RET']))
        Sharpe.append(avg[-1] / std[-1])
            
    return pred, avg, std, Sharpe

In [None]:
earlystop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, verbose=1)

# Use start years for naming keys in prediction dictionaries
start_val_year = 1975
start_test_year = 1987
# Construct monthly start and end dates for test set
start_test_ym = '1987-01'
end_test_ym = '1988-02'
all_months = np.arange(start_test_ym, end_test_ym, dtype='datetime64[M]').astype('datetime64[D]')

for i in range(1): # i is model
    # Housekeeping
    gc.collect()
    keras.backend.clear_session()
    
    # Train the model
    history = model_dict[str(i)].fit(x_train, y_train, validation_data = (x_val, y_val),
                               # change batch_size and epoch
                               batch_size=bs_val, epochs=100
                               # optional early stop
                               ,callbacks=[earlystop]
                               )
    
    # Save weights of all models for all years
    # e.g. 'weights_1987_0' means the first model weights when the test set starts on 1987, so training ends on 1987-13=1974
    model_dict[str(i)].save_weights(f'Results_log\\seed129_{start_test_year}_' + str(i) + '.h5')
    
    # Find model predictions for entire/top/bottom WITHOUT batch_size
    y_train_pred_dict[f'{start_val_year}_' + str(i)] = model_dict[str(i)].predict(x_train, batch_size=x_train.shape[0])
    y_val_pred_dict[f'{start_val_year}_' + str(i)] = model_dict[str(i)].predict(x_val, batch_size=x_val.shape[0])
    y_pred_dict[f'{start_test_year}_' + str(i)] = model_dict[str(i)].predict(x_test, batch_size=x_test.shape[0])
    y_pred_t_dict[f'{start_test_year}_' + str(i)] = model_dict[str(i)].predict(x_test_t, batch_size=x_test_t.shape[0])
    y_pred_b_dict[f'{start_test_year}_' + str(i)] = model_dict[str(i)].predict(x_test_b, batch_size=x_test_b.shape[0])
    predY, avgY, stdY, SharpeY = make_decile(x_test, y_test, pd.DataFrame(y_pred_dict[f'{start_test_year}_' + str(i)]), columns=['Pred'])
    performace = pd.DataFrame(data={'Pred': pred, 'Avg': avg, 'Std': std, 'SR': Sharpe})
    performace.to_csv(f'Results_log\\{start_test_year}_perf.csv')
        
    # Record loss and R^2; train and validation
    loss_train[0, i] = mse(y_train, y_train_pred_dict[f'{start_val_year}_' + str(i)])
    R2_train[0, i] = R_squared(y_train, y_train_pred_dict[f'{start_val_year}_' + str(i)])
    loss_val[0, i] = mse(y_val, y_val_pred_dict[f'{start_val_year}_' + str(i)])
    R2_val[0, i] = R_squared(y_val, y_val_pred_dict[f'{start_val_year}_' + str(i)])
    # test, top/bottom test
    loss_test[0, i] = mse(y_test, y_pred_dict[f'{start_test_year}_' + str(i)])
    R2_test[0, i] = R_squared(y_test, y_pred_dict[f'{start_test_year}_' + str(i)])
    loss_test_t[0, i] = mse(y_test_t, y_pred_t_dict[f'{start_test_year}_' + str(i)])
    R2_test_t[0, i] = R_squared(y_test_t, y_pred_t_dict[f'{start_test_year}_' + str(i)])
    loss_test_b[0, i] = mse(y_test_b, y_pred_b_dict[f'{start_test_year}_' + str(i)])
    R2_test_b[0, i] = R_squared(y_test_b, y_pred_b_dict[f'{start_test_year}_' + str(i)]) 
    
    decile = {}
    std = {}
    avg = {}
    Sharpe = {}
    for j in range(12): # j is month
        # Get entire, top, and bottom test sets
        start_testM = all_months[j]
        end_testM = all_months[j+1]
        x_testM, y_testM, x_test_tM, y_test_tM, x_test_bM, y_test_bM = interactions(
                                                data[(data['DATE'] >= start_testM) & (data['DATE'] <= end_testM)],
                                                data_ma[(data_ma['yyyymm'] >= start_testM) & (data_ma['yyyymm'] <= end_testM)],
                                                characteristics, ma_predictors)
        
        # Find model predictions for entire/top/bottom WITHOUT batch_size
        y_predM_dict[f'{start_test_year}_' + str(j) + '_' + str(i)] = model_dict[str(i)].predict(x_testM, batch_size=x_testM.shape[0])
        y_pred_tM_dict[f'{start_test_year}_' + str(j) + '_' + str(i)] = model_dict[str(i)].predict(x_test_tM, batch_size=x_test_tM.shape[0])
        y_pred_bM_dict[f'{start_test_year}_' + str(j) + '_' + str(i)] = model_dict[str(i)].predict(x_test_bM, batch_size=x_test_bM.shape[0])
        
        # Rename for easier reference later; RET is excess return
        y_predM = pd.DataFrame(y_predM_dict[f'{start_test_year}_' + str(j) + '_' + str(i)], columns=['Pred'])
        y_pred_tM = pd.DataFrame(y_pred_tM_dict[f'{start_test_year}_' + str(j) + '_' + str(i)], columns=['Pred'])
        y_pred_bM = pd.DataFrame(y_pred_bM_dict[f'{start_test_year}_' + str(j) + '_' + str(i)], columns=['Pred'])
        
        # Record loss and R^2
        loss_testM[0, j, i] = mse(y_testM, y_predM)
        R2_testM[0, j, i] = R_squared(y_testM, y_predM)
        loss_test_tM[0, j, i] = mse(y_test_tM, y_pred_tM)
        R2_test_tM[0, j, i] = R_squared(y_test_tM, y_pred_tM)
        loss_test_bM[0, j, i] = mse(y_test_bM, y_pred_bM)
        R2_test_bM[0, j, i] = R_squared(y_test_bM, y_pred_bM)  
    
        pred, std, avg, Sharpe = make_decile(x_testM, y_testM, y_predM)
        performace = pd.DataFrame(data={'Pred': pred, 'Avg': avg, 'Std': std, 'SR': Sharpe})
        performace.to_csv(f'Results_log\\{start_test_year}_perf_{j}.csv')

print(loss_train[0, 0], loss_val[0, 0], loss_test[0, 0], loss_test_t[0, 0], loss_test_b[0, 0])
print(R2_train[0, 0], R2_val[0, 0], R2_test[0, 0], R2_test_t[0, 0], R2_test_b[0, 0])
print(loss_testM[0, :], loss_test_tM[0, :], loss_test_bM[0, :])
print(R2_testM[0, :], R2_test_tM[0, :], R2_test_bM[0, :])


# Recursively do OLS_3 and refit NN models

In [None]:
def get_datetime64(date_digits):
    date_str = str(date_digits)
    date_str = date_str[0:4] + '-' + date_str[4:6] + '-' + date_str[6:8]
    return date_str

In [None]:
# start_val = np.datetime64('1975-01-31')
# start_test = np.datetime64('1987-01-31')
# end_test = np.datetime64('1987-12-31')
start_test_year = 1987
end_test_year = 1988
start_val_numer = 19750131
start_test_numer = 19870131
end_test_numer = 19871231

In [None]:
# Record the sizes of training, validation, test sets
train_shape = [0]*30
val_shape = [0]*30
test_shape = [0]*30
train_shape[0] = 479467
val_shape[0] = 773887
test_shape[0] = 83323

In [None]:
# There are in total 30 OOS years to test (1987-2016)
# Train every year, but test every month and every year
for year in range(1, 30):
    
    gc.collect()
    keras.backend.clear_session()
    
    # Set the correct dates
    start_val_prev_numer = start_val_numer
    start_test_prev_numer = start_test_numer
    end_test_prev_numer = end_test_numer
    start_val_numer = start_val_numer + 10000
    start_test_numer = start_test_numer + 10000
    end_test_numer = end_test_numer + 10000
    start_val_prev = get_datetime64(start_val_prev_numer)
    start_test_prev = get_datetime64(start_test_prev_numer)
    end_test_prev = get_datetime64(end_test_prev_numer)
    start_val = get_datetime64(start_val_numer)
    start_test = get_datetime64(start_test_numer)
    end_test = get_datetime64(end_test_numer)
    print(start_val_prev, start_test_prev, end_test_prev, start_val, start_test, end_test)
    
    # Add one more year to training
    x_train_add, y_train_add, _, _, _, _ = interactions(data[(data['DATE'] < start_val) & (data['DATE'] >= start_val_prev)],
                                           data_ma[(data_ma['yyyymm'] < start_val) & (data_ma['yyyymm'] >= start_val_prev)],
                                           characteristics, ma_predictors)
    x_train = pd.concat([x_train, x_train_add], ignore_index=True)
    y_train = pd.concat([y_train, y_train_add], ignore_index=True)
    train_shape[year] = x_train.shape[0]
    
    # Since x,y_val has no more date inside, we will just get them again
    x_val, y_val, _, _, _, _ = interactions(data[(data['DATE'] < start_test) & (data['DATE'] >= start_val)],
                                                                    data_ma[(data_ma['yyyymm'] < start_test) & (data_ma['yyyymm'] >= start_val)],
                                                                    characteristics, ma_predictors)
    val_shape[year] = x_val.shape[0]

    # Change the test set to the next year
    x_test, y_test, x_test_t, y_test_t, x_test_b, y_test_b = interactions(data[(data['DATE'] >= start_test) & (data['DATE'] <= end_test)],
                                                                          data_ma[(data_ma['yyyymm'] >= start_test) & (data_ma['yyyymm'] <= end_test)],
                                                                          characteristics, ma_predictors)
    test_shape[year] = x_test.shape[0]
    
    # Do OLS_3 first
    # Replace the OLS model every iteration (could put in dictionary if want to save all 30 of them)
    OLS_3 = LinearRegression().fit(x_train[features_3], y_train)
    
    OLS_3_train_mse.append(mean_squared_error(y_train, OLS_3.predict(x_train[features_3])))
    OLS_3_val_mse.append(mean_squared_error(y_val, OLS_3.predict(x_val[features_3])))
    OLS_3_test_mse.append(mean_squared_error(y_test, OLS_3.predict(x_test[features_3])))
    OLS_3_train_R2.append(R_oos(y_train, OLS_3.predict(x_train[features_3])))
    OLS_3_val_R2.append(R_oos(y_val, OLS_3.predict(x_val[features_3])))
    OLS_3_test_R2.append(R_oos(y_test, OLS_3.predict(x_test[features_3])))
    
    #OLS_3_val_t_mse.append(mean_squared_error(y_val_t, OLS_3.predict(x_val_t[features_3])))
    OLS_3_test_t_mse.append(mean_squared_error(y_test_t, OLS_3.predict(x_test_t[features_3])))
    #OLS_3_val_b_mse.append(mean_squared_error(y_val_b, OLS_3.predict(x_val_b[features_3])))
    OLS_3_test_b_mse.append(mean_squared_error(y_test_b, OLS_3.predict(x_test_b[features_3])))
    #OLS_3_train_t_R2_demeaned.append(r2_score(y_train_t, OLS_3.predict(x_train_t[features_3])))
    #OLS_3_train_t_R2.append(R_oos(y_train_t, OLS_3.predict(x_train_t[features_3])))
    #OLS_3_val_t_R2.append(R_oos(y_val_t, OLS_3.predict(x_val_t[features_3])))
    OLS_3_test_t_R2.append(R_oos(y_test_t, OLS_3.predict(x_test_t[features_3])))
    #OLS_3_val_b_R2.append(R_oos(y_val_b, OLS_3.predict(x_val_b[features_3])))
    OLS_3_test_b_R2.append(R_oos(y_test_b, OLS_3.predict(x_test_b[features_3])))
    print(OLS_3_train_R2[-1], OLS_3_val_R2[-1],  OLS_3_val_t_R2[-1], OLS_3_val_b_R2[-1], OLS_3_test_R2[-1], OLS_3_test_t_R2[-1], 
         OLS_3_test_b_R2[-1])
    
    for i in range(10):
        # Housekeeping
        keras.backend.clear_session()
        gc.collect()
        
        # This refits the model used before
        history = model_dict[str(i)].fit(x_train, y_train, validation_data = (x_val, y_val),
                               # change batch_size and epoch
                               batch_size=bs_val, epochs=100
                               # optional early stop
                               ,callbacks=[earlystop]
                               )
    
        loss_val_t[year, i], R2_val_t[year, i] = model_dict[str(i)].evaluate(x_val_t, y_val_t, batch_size=bs_val)
        loss_val_b[year, i], R2_val_b[year, i] = model_dict[str(i)].evaluate(x_val_b, y_val_b, batch_size=bs_val)
        loss_test[year, i], R2_test[year, i] = model_dict[str(i)].evaluate(x_test, y_test, batch_size=bs_val)
        loss_test_t[year, i], R2_test_t[year, i] = model_dict[str(i)].evaluate(x_test_t, y_test_t, batch_size=bs_val)
        loss_test_b[year, i], R2_test_b[year, i] = model_dict[str(i)].evaluate(x_test_b, y_test_b, batch_size=bs_val)

    #print(loss_val_t[year, :], loss_val_b[year, :], loss_test[year, :], loss_test_t[year, :], loss_test_b[year, :])
    print(R2_val_t[year, :], R2_val_b[year, :], R2_test[year, :], R2_test_t[year, :], R2_test_b[year, :])    
        
        

In [None]:
# Some references for model.evaluate()
# https://stackoverflow.com/questions/50723287/meaning-of-batch-size-in-model-evaluate (floating point error)
# https://stackoverflow.com/questions/49359489/how-are-metrics-computed-in-keras (val metric)

print(OLS_3_train_R2)
print(OLS_3_val_R2)
print(OLS_3_val_t_R2)
print(OLS_3_val_b_R2)
print(OLS_3_test_R2)
print(OLS_3_test_t_R2)
print(OLS_3_test_b_R2)

In [None]:
print(R2_val_t[0:12, :])
print(R2_val_b[0:12, :])
print(R2_test[0:12, :])
print(R2_test_t[0:12, :])
print(R2_test_b[0:12, :])

In [None]:
for i in range(10):
    model_dict[str(i)].save_weights(f'model_weights_{i}.h5')

In [None]:
model_dict['9'].evaluate(x_val_t, y_val_t, batch_size=10000)

In [None]:
np.mean(R2_test[0:12,:], axis=1) - np.array(OLS_3_test_R2)

In [None]:
np.mean(R2_test_t[0:12,:], axis=1) - np.array(OLS_3_test_t_R2)

In [None]:
np.mean(R2_test_b[0:12,:], axis=1) - np.array(OLS_3_test_b_R2)

In [None]:
gc.collect()

# Other NN models

In [None]:
# Ensemble method NN3
loss_list_3 = []
R2_list_3 = []
for i in range(10):
    seed_val = 120 + i
    model_NN3_dft = model.call(
                    model_input = keras.layers.Input(shape=(920, )),
                    n_layers = 3,
                    activation = 'relu',
                    BatchNormalization = True,
                    first_layer_dim = 32,
                    L1_lambda = 0.5,
                    seed = seed_val)
    
    model_NN3_dft.compile(
    loss = keras.losses.MeanSquaredError(),
    # Specify the learning rate
    optimizer=keras.optimizers.Adam(2e-4),
    metrics=[R_squared]
    )

    # optional early stop
    earlystop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=1)

    history = model_NN3_dft.fit(x_train, y_train, validation_data = (x_val, y_val),
                           # change batch_size and epoch
                           batch_size=512, epochs=100
                           # optional early stop
                           ,callbacks=[earlystop]
                           )
    loss_test, R2_test = model_NN3_dft.evaluate(x_test, y_test)
    loss_list_3.append(loss_test)
    R2_list_3.append(R2_test)
    
    gc.collect()

In [None]:
print(loss_list_3)
print(R2_list_3)
print(np.mean(loss_list_3))
print(np.mean(R2_list_3))

In [None]:
# Ensemble method NN2
loss_list_2 = []
R2_list_2 = []
for i in range(10):
    seed_val = 120 + i
    model_NN2_dft = model.call(
                    model_input = keras.layers.Input(shape=(920, )),
                    n_layers = 2,
                    activation = 'relu',
                    BatchNormalization = True,
                    first_layer_dim = 32,
                    L1_lambda = 0.5,
                    seed = seed_val)
    
    model_NN2_dft.compile(
    loss = keras.losses.MeanSquaredError(),
    # Specify the learning rate
    optimizer=keras.optimizers.Adam(2e-4),
    metrics=[R_squared]
    )

    # optional early stop
    earlystop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=1)

    history = model_NN2_dft.fit(x_train, y_train, validation_data = (x_val, y_val),
                           # change batch_size and epoch
                           batch_size=512, epochs=100
                           # optional early stop
                           ,callbacks=[earlystop]
                           )
    loss_test, R2_test = model_NN2_dft.evaluate(x_test, y_test)
    loss_list_2.append(loss_test)
    R2_list_2.append(R2_test)
    
    gc.collect()

In [None]:
print(loss_list_2)
print(R2_list_2)
print(np.mean(loss_list_2))
print(np.mean(R2_list_2))

In [None]:
# Ensemble method NN5
loss_list_5 = []
R2_list_5 = []
for i in range(10):
    seed_val = 120 + i
    model_NN5_dft = model.call(
                    model_input = keras.layers.Input(shape=(920, )),
                    n_layers = 5,
                    activation = 'relu',
                    BatchNormalization = True,
                    first_layer_dim = 32,
                    L1_lambda = 0.5,
                    seed = seed_val)
    
    model_NN5_dft.compile(
    loss = keras.losses.MeanSquaredError(),
    # Specify the learning rate
    optimizer=keras.optimizers.Adam(2e-4),
    metrics=[R_squared]
    )

    # optional early stop
    earlystop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=1)

    history = model_NN5_dft.fit(x_train, y_train, validation_data = (x_val, y_val),
                           # change batch_size and epoch
                           batch_size=512, epochs=100
                           # optional early stop
                           ,callbacks=[earlystop]
                           )
    loss_test, R2_test = model_NN5_dft.evaluate(x_test, y_test)
    loss_list_5.append(loss_test)
    R2_list_5.append(R2_test)
    
    gc.collect()

In [None]:
print(loss_list_5)
print(R2_list_5)
print(np.mean(loss_list_5))
print(np.mean(R2_list_5))

In [None]:
gc.collect()

In [None]:
model_NN4 = model.call(
                    # change the shape to match input shape
                    model_input = keras.layers.Input(shape=(920, )),
                    # number of hidden layers 
                    n_layers = 4,
                    # Applied to all layers. Common activations are relu, softmax, sigmoid, tanh
                    activation = 'relu',
                    # True or False for batch normalization. BatchNorm is applied after every layer
                    BatchNormalization = True,
                    # number of neurons in the first layer. Assume every subsequent layer has half as many neurons as the previous layer
                    first_layer_dim = 32,
                    # L2_lambda is the parameter for L2_regularization. Set to 0 for no regularization, default is 0.01
                    L2_lambda = 1e-4)
model_NN4.summary()

In [None]:
model_NN4.compile(
    loss = keras.losses.MeanSquaredError(),
    # Specify the learning rate
    optimizer=keras.optimizers.Adam(0.01),
    metrics=[R_squared]
)

# optional early stop
#earlystop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=1)

history = model_NN4.fit(x_train, y_train, validation_data = (x_val, y_val),
                           # change batch_size and epoch
                           batch_size=10000, epochs=100
                           # optional early stop
                           #,callbacks=[earlystop]
                           )

In [None]:
model_NN4.evaluate(x_test, y_test)