In [1]:
import os
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
from pysr import PySRRegressor

import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# so deprecation notices raised by the libraries used below will not be shown.
warnings.filterwarnings("ignore")

# Sample window applied to the factor tables loaded below: a row is kept when
# start_date <= index < end_date (start inclusive, end exclusive).
start_date = "2010-01-01"
end_date = "2020-01-01"

In [2]:
# Read the macro factors from the paper "A Comprehensive Look at The Empirical Performance of Equity Premium Prediction"
# Source: https://sites.google.com/view/agoyal145
# Source: https://docs.google.com/spreadsheets/d/1g4LOaRj4TvwJr9RIaA_nwrXXWTOy46bP/edit#gid=2070662242

# Parse the workbook once: passing a list to `sheet_name` returns a dict of
# DataFrames keyed by sheet name, so the file is not opened and parsed twice.
macro_sheets = pd.read_excel(
    "data_clean/macro_factors.xlsx",
    sheet_name=["Annual", "Monthly"],
    index_col=0,
)


def _prepare_macro_factors(raw_factors, date_format):
    """Return one macro-factor sheet restricted to the sample window.

    Parameters
    ----------
    raw_factors : pandas.DataFrame
        Sheet as read from the workbook, indexed by its first column.
    date_format : str
        strptime-style format used to parse the index ('%Y' or '%Y%m').

    Returns
    -------
    pandas.DataFrame
        Rows with start_date <= index < end_date, without the first three
        columns, and without any column that has a missing value in the window.
    """
    factors = raw_factors.copy()
    factors.index = pd.to_datetime(factors.index, format=date_format)
    in_window = (factors.index >= start_date) & (factors.index < end_date)
    # The first three columns are skipped, as in the original cell; the reason
    # is not visible here (presumably they are not used as predictors -- TODO confirm).
    return factors.iloc[in_window, 3:].dropna(axis=1)


factors_annual = _prepare_macro_factors(macro_sheets["Annual"], "%Y")
factors_monthly = _prepare_macro_factors(macro_sheets["Monthly"], "%Y%m")

In [20]:
# Fama-French 5 factors (annual)
# Source: https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/data_library.html

ff5_annual = pd.read_csv("data_clean/FF5_annual.csv", index_col=0)
# Drop the last column and rescale the rest by 1/100
# (presumably percent -> decimal -- TODO confirm against the CSV).
ff5_annual = ff5_annual.iloc[:, :-1].div(100)
ff5_annual.index = pd.to_datetime(ff5_annual.index, format='%Y')

# Keep only rows inside the half-open sample window [start_date, end_date).
in_window = (ff5_annual.index >= start_date) & (ff5_annual.index < end_date)
ff5_annual = ff5_annual.loc[in_window]

In [3]:
# Read the long-format portfolio weights produced by main_1_get_weight.ipynb
WEIGHTS_DIR = 'portfolio_weights'

# Row strides used to thin the pivoted weight tables. Presumably these are the
# approximate number of trading days per year / per month applied to daily
# rows -- TODO confirm. Striding by row count is not calendar-aligned, so the
# sampled dates can drift over time.
ANNUAL_ROW_STEP = 252
MONTHLY_ROW_STEP = 21


def _load_weight_table(file_name, row_step):
    """Load one weight file and return it as a thinned Date x Ticker table.

    Parameters
    ----------
    file_name : str
        Name of a CSV inside WEIGHTS_DIR with "Date", "Ticker" and "Weight" columns.
    row_step : int
        Keep every `row_step`-th row of the pivoted table, starting at the first.

    Returns
    -------
    pandas.DataFrame
        Weights indexed by parsed date, one column per ticker.
    """
    long_weights = pd.read_csv(os.path.join(WEIGHTS_DIR, file_name))
    long_weights["Date"] = pd.to_datetime(long_weights["Date"])
    wide_weights = long_weights.pivot(index="Date", columns="Ticker", values="Weight")
    return wide_weights[::row_step]


# os.listdir returns entries in arbitrary order; sorting makes the strategy
# order (and therefore the dict insertion order) reproducible across machines.
file_names = sorted(
    entry for entry in os.listdir(WEIGHTS_DIR)
    if os.path.isfile(os.path.join(WEIGHTS_DIR, entry))
)
file_names_annual = [name for name in file_names if "annual" in name]
file_names_monthly = [name for name in file_names if "month" in name]

# Strategy name = file name without its frequency prefix and the .csv suffix.
names_annual = [name.replace('weight_annual_', '').replace('.csv', '') for name in file_names_annual]
names_monthly = [name.replace('weight_monthly_', '').replace('.csv', '') for name in file_names_monthly]

weights_annual = {
    strategy_name: _load_weight_table(file_name, ANNUAL_ROW_STEP)
    for strategy_name, file_name in zip(names_annual, file_names_annual)
}

weights_monthly = {
    strategy_name: _load_weight_table(file_name, MONTHLY_ROW_STEP)
    for strategy_name, file_name in zip(names_monthly, file_names_monthly)
}

# Symbolic Regression on Annual Weights

In [44]:
# Regressors: the annual Fama-French factor table.
# Targets: the annual weight table stored under the "sae3CNNlstm" key
# (one output column per ticker, so the regression is multi-output).
x = ff5_annual
y = weights_annual["sae3CNNlstm"]

# The fit below is intentionally left commented out: per the original note the
# search has some randomness, so a previously saved model is loaded with
# from_file in a later cell instead of being re-fitted here.
# NOTE(review): the "inv" entry in extra_sympy_mappings looks unused, since only
# the four binary operators are listed -- confirm before re-enabling the fit.
# # Note: there is some randomness; thus using from_file
# model = PySRRegressor(
#     niterations=40,  # < Increase me for better results
#     binary_operators=["+", "-", "*", "/"],
#     extra_sympy_mappings={"inv": lambda x: 1 / x},
#     loss="loss(prediction, target) = (prediction - target)^2",
#     progress=False
# )

# model.fit(x, y)

In [41]:
# Load a previously saved PySR model from disk instead of re-fitting
# (presumably the one fitted on the FF5 inputs, judging by the file name).
# NOTE(review): the file is a pickle (.pkl); only load it from a trusted source,
# because unpickling can execute arbitrary code.
# model = PySRRegressor.from_file("pysr_result_macro_factors.pkl")
model = PySRRegressor.from_file("pysr_model_FF5.pkl")

Checking if pysr_model_FF5.pkl exists...
Loading model from pysr_model_FF5.pkl


In [42]:
# Print the selected ("best") expression for every output column of `y`.
# get_best() is evaluated once up front rather than once per loop iteration;
# for this multi-output model it returns one row per output.
best_rows = model.get_best()
for i, best_row in enumerate(best_rows):
    best = best_row["equation"]
    # y.columns[i] is the ticker name that output i was fitted against.
    print(f"{y.columns[i]} = " + best)

AGG = ((((((0.75528973 - Mkt_RF) - Mkt_RF) * Mkt_RF) * -2.64096) + (RMW / 0.48064393)) + (0.5782028 + SMB))
DBC = ((((Mkt_RF * ((-0.008375801 / SMB) + (HML / (-0.008375801 - 0.24041964)))) - -0.10228153) + CMA) / 1.0571132)
VIX = (-0.009205342 / (((-0.29511827 + CMA) + Mkt_RF) - ((SMB * 1.3917743) + 0.10031292)))
VTI = (0.19678046 / ((0.55597466 - SMB) + (((RMW * ((HML - CMA) / CMA)) + SMB) / 0.29991513)))


In [43]:
# Relative error of the PySR predictions against the target weights,
# element-wise: (prediction - target) / target.
# NOTE(review): the displayed values are not uniformly small -- several entries
# exceed 0.5 in magnitude and the largest is about 2.5 -- so read the table
# cell by cell rather than assuming a tight fit.
y_hat = model.predict(x)
(y_hat - y).div(y)

Ticker,AGG,DBC,VIX,VTI
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-12-29,-0.061287,0.020485,-0.242524,-0.080327
2011-12-28,-0.066824,0.199545,0.878076,0.015965
2012-12-27,0.265642,0.011634,0.098146,0.055776
2013-12-26,0.032559,-0.234304,0.689265,-0.043566
2014-12-26,0.175227,-0.16264,-0.1291,0.037465
2015-12-24,-0.036823,-0.486224,-0.279445,0.228081
2016-12-23,-0.063642,0.132057,1.80133,-0.319096
2017-12-26,0.493512,-0.024566,0.470744,-0.190145
2018-12-24,-0.163108,0.113206,0.090495,-0.811119
2019-12-23,-0.195837,0.129068,0.022891,2.516073


# Symbolic Regression on Annual Weights (no division)

In [None]:
# Same regressors and targets as the previous section: the annual Fama-French
# factor table and the annual weight table stored under the "sae3CNNlstm" key.
x = ff5_annual
y = weights_annual["sae3CNNlstm"]

# Variant of the search with "/" removed from the binary operators. The fit is
# left commented out; a previously saved model is loaded with from_file in a
# later cell because, per the original note, the search has some randomness.
# NOTE(review): the "inv" entry in extra_sympy_mappings looks unused here as
# well, since no matching operator is listed -- confirm before re-enabling.
# # Note: there is some randomness; thus using from_file
# model = PySRRegressor(
#     niterations=40,  # < Increase me for better results
#     binary_operators=["+", "-", "*"],
#     extra_sympy_mappings={"inv": lambda x: 1 / x},
#     loss="loss(prediction, target) = (prediction - target)^2",
#     progress=False
# )

# model.fit(x, y)

In [46]:
# Load a previously saved PySR model from disk instead of re-fitting
# (presumably the no-division variant, judging by the file name). This rebinds
# `model`, replacing the model loaded in the previous section.
# NOTE(review): the file is a pickle (.pkl); only load it from a trusted source,
# because unpickling can execute arbitrary code.
model = PySRRegressor.from_file("pysr_model_FF5_no_division.pkl")

Checking if pysr_model_FF5_no_division.pkl exists...
Loading model from pysr_model_FF5_no_division.pkl


In [47]:
# Print the selected ("best") expression for every output column of `y`.
# get_best() is evaluated once up front rather than once per loop iteration;
# for this multi-output model it returns one row per output.
best_rows = model.get_best()
for i, best_row in enumerate(best_rows):
    best = best_row["equation"]
    # y.columns[i] is the ticker name that output i was fitted against.
    print(f"{y.columns[i]} = " + best)

AGG = ((((0.6754607 - ((HML * -25.817741) * (SMB - CMA))) + SMB) - Mkt_RF) + RMW)
DBC = ((0.085083224 - (((CMA * 31.874401) - Mkt_RF) * RMW)) + CMA)
VIX = (((Mkt_RF + ((Mkt_RF - HML) + Mkt_RF)) - HML) * ((0.17106752 - SMB) + (CMA + -0.06438767)))
VTI = (0.3470881 - ((CMA - ((HML - RMW) - CMA)) * ((1.2123634 - SMB) - SMB)))


In [50]:
# Relative error of the PySR predictions against the target weights,
# element-wise: (prediction - target) / target.
# NOTE(review): the displayed values are not uniformly small -- several entries
# exceed 0.5 in magnitude, with extremes of about 2.14 and -1.56 -- so read the
# table cell by cell rather than assuming a tight fit.
y_hat = model.predict(x)
(y_hat - y).div(y)

Ticker,AGG,DBC,VIX,VTI
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-12-29,0.083154,0.09182,0.101087,-0.425661
2011-12-28,-0.008006,0.69255,0.811181,0.475605
2012-12-27,-0.09077,-0.101747,-0.068161,-0.127913
2013-12-26,-0.187768,0.891228,-0.092128,-0.015461
2014-12-26,0.663981,-0.354663,0.126115,-0.315932
2015-12-24,-0.009463,-0.023461,-0.521505,0.152047
2016-12-23,0.016583,-0.081971,-1.556228,0.097393
2017-12-26,-0.101047,0.218128,0.637878,-0.258752
2018-12-24,0.021384,0.587248,-1.111339,0.793614
2019-12-23,-0.207658,-0.432822,-0.069678,2.14486
