# OLS-3
OLS regression using only three covariates: size (`mve`), book-to-market (`bm`), and momentum (`mom12m`).
### Load Modules

In [1]:
import os
import random

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

### Load Dataset

In [2]:
# Read the monthly firm-characteristics panel used as model input.
# NOTE(review): DATA_PATH is an absolute, machine-specific location; it has to be
# adjusted before the notebook can run on another machine.
DATA_PATH = 'C:/Users/rafae/Documents/HSG/Master Thesis/Data/Final/data07_model_input.parquet'

# Order the rows by month and then by permno, and key the frame by
# (year, YM, permno) so that later cells can select whole years with .loc.
df = (
    pd.read_parquet(DATA_PATH)
    .sort_values(by=['YM', 'permno'])
    .set_index(['year', 'YM', 'permno'])
)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,gvkey,reprisk_id,prc,vol,mve_m,absacc,acc,aeavol,age,agr,...,sic2_73,sic2_75,sic2_78,sic2_79,sic2_80,sic2_81,sic2_82,sic2_83,sic2_87,sic2_99
year,YM,permno,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2007,2007-01,10025,11903,37172,45.320000,8086.0,3.700557e+05,0.698728,-0.745547,-0.646819,0.457506,-0.979644,...,0,0,0,0,0,0,0,0,0,0
2007,2007-01,10026,12825,12684,39.689999,7613.0,7.653725e+05,0.577608,-0.635623,-0.393384,0.457506,0.118575,...,0,0,0,0,0,0,0,0,0,0
2007,2007-01,10042,12139,4832,0.720000,26008.0,3.598898e+04,0.990840,-0.989822,-0.894148,0.457506,-0.989822,...,0,0,0,0,0,0,0,0,0,0
2007,2007-01,10078,12136,1719,6.130000,11333293.0,2.390900e+07,0.654962,-0.711959,-0.128753,0.905344,-0.147074,...,0,0,0,0,0,0,0,0,0,0
2007,2007-01,10104,12142,4413,16.430000,7234361.0,8.892640e+07,-0.014758,-0.107379,0.780153,0.905344,0.770992,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021,2021-12,93304,184167,91339,36.750000,183303.0,1.695898e+06,-0.855522,0.706269,-0.871045,-0.500896,0.613134,...,0,0,0,0,0,0,0,0,0,0
2021,2021-12,93373,184323,74074,3.020000,528712.0,2.065325e+05,-0.213134,0.143881,0.663284,-0.500896,-0.875821,...,0,0,0,0,0,0,0,0,0,0
2021,2021-12,93374,184899,64442,74.510002,159495.0,8.587073e+06,-0.514030,0.423284,-0.223881,-0.500896,0.328955,...,0,0,0,0,0,0,0,0,0,0
2021,2021-12,93423,10567,22547,39.490002,254917.0,3.661156e+06,0.242985,-0.303881,0.560597,0.143881,-0.584478,...,0,0,0,1,0,0,0,0,0,0


In [3]:
# The three regressors that make up the OLS-3 specification
ols_3_components = ['mve', 'bm', 'mom12m']

# Feature matrix (three columns) and target, both taken from the full panel.
# Y is selected with a list so that it stays a one-column DataFrame.
X = df.loc[:, ols_3_components]
Y = df.loc[:, ['ret_ex']]

# Show both shapes as a quick check that the row counts agree
X.shape, Y.shape

((338288, 3), (338288, 1))

### Hyperparameter Optimization: 4-Fold CV (12 training years / 4 folds = 3 validation years per fold) and 3-Year Test Set

In [4]:
# Split by calendar year on the first index level:
# 2007-2018 (12 years, 80%) for training, 2019-2021 (3 years, 20%) for testing.
# Label slices with .loc include both end points.
TRAIN_SPAN = slice('2007', '2018')
TEST_SPAN = slice('2019', '2021')

X_trai, Y_trai = X.loc[TRAIN_SPAN], Y.loc[TRAIN_SPAN]
X_test, Y_test = X.loc[TEST_SPAN], Y.loc[TEST_SPAN]

In [5]:
# Blocked 4-fold cross validation over calendar years: each fold holds out OFF
# consecutive years for validation and trains on all remaining years
# (9 training years and 3 validation years for the 12-year training window).
K_FOLDs = 4
YEARS = [str(year) for year in X_trai.index.unique(level='year')]
TOT = len(YEARS)                             # number of years available for CV
TRA = int(TOT * (K_FOLDs - 1) / K_FOLDs)     # years used for fitting in each fold
OFF = TOT - TRA                              # years held out for validation in each fold

# Print the validation/training years of every fold to verify the split
for FOLD in range(K_FOLDs):
    start, stop = FOLD * OFF, (FOLD + 1) * OFF
    VALI = YEARS[start:stop]
    TRAI = [year for year in YEARS if year not in VALI]
    print(VALI, TRAI)

['2007', '2008', '2009'] ['2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018']
['2010', '2011', '2012'] ['2007', '2008', '2009', '2013', '2014', '2015', '2016', '2017', '2018']
['2013', '2014', '2015'] ['2007', '2008', '2009', '2010', '2011', '2012', '2016', '2017', '2018']
['2016', '2017', '2018'] ['2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015']


In [6]:
# Model identifier: used as the prediction column name and as the
# directory / file-name prefix when results are written to disk.
model_name = 'OLS-3'

In [7]:
# Validation and testing architecture.
# For every fold a fresh OLS model is fitted on the fold's training years and
# evaluated on its held-out validation years. The out-of-fold predictions are
# written into Y_val_preds next to the realised target, and the per-fold
# validation MSE is collected in `results`.
Y_val_preds = Y_trai.copy()
Y_test_preds = Y_test.copy()
results = []

for FOLD in range(K_FOLDs):
    VALI = YEARS[(FOLD*OFF):((FOLD+1)*OFF)]
    TRAI = [x for x in YEARS if x not in VALI]

    # Reset the global seeds at the start of every fold
    np.random.seed(2022)
    random.seed(2022)

    # Create and fit the model on the training years of this fold
    ols_3 = LinearRegression(n_jobs=-1)
    ols_3.fit(X_trai.loc[TRAI], Y_trai.loc[TRAI])

    # The target is a one-column frame, so predict() returns shape (n, 1);
    # flatten it so that a plain 1-D vector is stored in the prediction column.
    val_preds = ols_3.predict(X_trai.loc[VALI]).ravel()
    Y_val_preds.loc[VALI, model_name] = val_preds
    val_loss = mean_squared_error(Y_trai.loc[VALI], val_preds)

    # Test predictions are produced once, by the model of the first fold.
    # Y_test_preds holds exactly the rows of X_test, so the column is assigned
    # directly instead of repeating the test years as hard-coded labels.
    # NOTE(review): this model was fitted on the fold-0 training years only,
    # not on the full training window -- confirm that this is intended.
    if FOLD == 0:
        Y_test_preds[model_name] = ols_3.predict(X_test).ravel()

    # Append results
    results.append({
        'fold'    :FOLD,
        'val_loss':val_loss
    })

In [8]:
# Save Y_val_preds (target plus out-of-fold predictions) under <model_name>/.
# to_csv does not create missing directories, so make sure the folder exists
# first; otherwise a fresh checkout fails on this cell.
os.makedirs(model_name, exist_ok=True)
Y_val_preds.to_csv(r'%s/%s_val_preds.csv'%(model_name, model_name))
Y_val_preds

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ret_ex,OLS-3
year,YM,permno,Unnamed: 3_level_1,Unnamed: 4_level_1
2007,2007-01,10025,-0.031894,0.013382
2007,2007-01,10026,-0.042317,0.012179
2007,2007-01,10042,-0.125751,0.004057
2007,2007-01,10078,-0.080607,0.014305
2007,2007-01,10104,-0.046341,0.016643
...,...,...,...,...
2018,2018-12,93420,0.086508,0.003553
2018,2018-12,93422,0.466817,0.003315
2018,2018-12,93423,0.105036,0.006856
2018,2018-12,93429,-0.048712,0.005958


In [9]:
# Save Y_test_preds (target plus test-set predictions) under <model_name>/.
# to_csv does not create missing directories, so make sure the folder exists
# first; otherwise a fresh checkout fails on this cell.
os.makedirs(model_name, exist_ok=True)
Y_test_preds.to_csv(r'%s/%s_test_preds.csv'%(model_name, model_name))
Y_test_preds

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ret_ex,OLS-3
year,YM,permno,Unnamed: 3_level_1,Unnamed: 4_level_1
2019,2019-01,10026,0.004225,0.012895
2019,2019-01,10104,0.036026,0.013704
2019,2019-01,10107,0.075381,0.016454
2019,2019-01,10138,0.072777,0.012548
2019,2019-01,10145,0.076596,0.013794
...,...,...,...,...
2021,2021-12,93304,-0.096386,0.006868
2021,2021-12,93373,-0.019481,0.012690
2021,2021-12,93374,-0.047552,0.013211
2021,2021-12,93423,-0.072569,0.009466


In [10]:
# Result overview: one row per fold with its validation MSE
table = pd.DataFrame(results)

# to_csv does not create missing directories, so make sure the folder exists
os.makedirs(model_name, exist_ok=True)
table.to_csv(r'%s/%s_results.csv'%(model_name, model_name))

# Display the per-fold RMSE (square root of the validation MSE) scaled by 100
np.sqrt(table[['val_loss']]) * 100

Unnamed: 0,val_loss
0,16.660391
1,11.85842
2,10.772236
3,11.655554


## Out-of-Sample: Save Predictions

In [11]:
# Model (repeated here so that this section only depends on the saved CSV)
model_name = 'OLS-3'

# Reload the saved test-set predictions, restoring the (year, YM, permno) index
test = pd.read_csv(r'%s/%s_test_preds.csv'%(model_name, model_name), index_col=['year', 'YM', 'permno'])

# Expose the model's prediction column under the generic name 'y_pred'
test_ret = test.copy()
test_ret['y_pred'] = test[model_name]

# Keep only the month, the permno identifier and the prediction
test_ret = test_ret.reset_index()
test_ret = test_ret[['YM', 'permno', 'y_pred']]

# Save predictions; to_csv does not create missing directories, so make sure
# the output folder exists first
os.makedirs('results', exist_ok=True)
test_ret.to_csv(r'results/{}_predictions.csv'.format(model_name), index=False)