In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
import glob
from IPython.display import Latex
import IPython
import re


In [3]:
# Date window used to slice every series loaded in the cells below.
start_date, end_date = "2020-01-12", "2022-10-23"

# Number of rows CLOSE is shifted by when it is differenced into RETURNS.
return_scale = 1

## VIX Data

In [4]:
# Locate the VIX history export. glob returns matches in arbitrary order, so
# sort them for a deterministic pick and fail loudly when nothing matches.
d1_name = "Data/VIX_History*"
d1_matches = sorted(glob.glob(d1_name))
if not d1_matches:
    raise FileNotFoundError("No file matches pattern {!r}".format(d1_name))
d1_filename = d1_matches[0]
d1 = pd.read_csv(d1_filename)

d1.columns = d1.columns.str.upper()
d1["DATE"] = pd.to_datetime(d1["DATE"])
d1 = d1.set_index("DATE")

# RETURNS is the change in CLOSE over `return_scale` rows (a difference in
# levels, not a percentage); the leading rows without a predecessor are dropped.
d1["RETURNS"] = d1["CLOSE"] - d1["CLOSE"].shift(return_scale)
d1 = d1.dropna()

# Weekly frame: every column is summed within each week.
# NOTE(review): this also sums CLOSE, so vix_week["VIX"] is a weekly sum of
# closes rather than a weekly level -- confirm that this is intended.
d1_week = d1.resample("1W").sum()

# The daily index becomes 'YYYY-MM-DD' strings; the weekly frame was built
# before this line and keeps its DatetimeIndex.
d1.index = d1.index.strftime('%Y-%m-%d')

vix = d1.loc[start_date:end_date]
vix_week = d1_week.loc[start_date:end_date]

returns = vix[["RETURNS"]]
week_returns = vix_week[["RETURNS"]]

vix = vix[["CLOSE"]].rename(columns={"CLOSE": "VIX"})
vix_week = vix_week[["CLOSE"]].rename(columns={"CLOSE": "VIX"})

## Metaculus Data

In [5]:
# Locate the smoothed Metaculus user-count export. glob returns matches in
# arbitrary order, so sort them for a deterministic pick and fail loudly when
# nothing matches.
d2_name = "Data/2_Day_Smooth_Metaculus_Users_*"
d2_matches = sorted(glob.glob(d2_name))
if not d2_matches:
    raise FileNotFoundError("No file matches pattern {!r}".format(d2_name))
d2_filename = d2_matches[0]
d2 = pd.read_csv(d2_filename)

d2.columns = d2.columns.str.upper()
d2["DATE"] = pd.to_datetime(d2["DATE"])
d2 = d2.set_index("DATE")
# After DATE becomes the index the file is expected to have exactly one value
# column; it is renamed to META (a different column count raises here).
d2.columns = ["META"]
d2_week = d2.resample("1W").sum()

# NOTE(review): this frame keeps a DatetimeIndex, while the daily VIX frame is
# re-indexed with 'YYYY-MM-DD' strings; combining the two relies on pandas
# matching those labels during alignment -- confirm on the pandas version in use.
metaculus = d2.loc[start_date:end_date]
metaculus_week = d2_week.loc[start_date:end_date]




## Idealized Stream

In [6]:
# Locate the idealized stream export. glob returns matches in arbitrary order,
# so sort them for a deterministic pick and fail loudly when nothing matches.
partial_name = 'Data/Idealized*'
stream_matches = sorted(glob.glob(partial_name))
if not stream_matches:
    raise FileNotFoundError("No file matches pattern {!r}".format(partial_name))
filename = stream_matches[0]

# NOTE: `d1` / `d1_week` are rebound here, replacing the VIX frames that used
# the same names earlier in the notebook.
d1 = pd.read_csv(filename)

d1['DATE'] = pd.to_datetime(d1["DATE"])
d1 = d1.set_index("DATE")
d1_week = d1.resample("1W").sum()
d1.index = d1.index.strftime('%Y-%m-%d')

# Keep `stream` as a one-column DataFrame (not a Series): the regression code
# later in the notebook attaches feature columns to it and reads `.columns`.
stream = d1.loc[start_date:end_date, ["CLOSE"]]
stream_week = d1_week.loc[start_date:end_date]

## Governing Function

In [7]:
def _build_lagged_design(data, response, lags, power):
    """Build the regression table: the response plus lagged power terms.

    Returns ``(table, target_column, feature_columns)``. Rows containing any
    missing value are dropped from the table.
    """
    if len(data) != len(lags):
        raise ValueError(
            "data and lags must pair up one-to-one, got {} frames and {} lag "
            "sets".format(len(data), len(lags))
        )

    # Feature columns can only be attached to a DataFrame; promote a Series.
    if isinstance(response, pd.Series):
        reg_data = response.to_frame()
    else:
        reg_data = response.copy()
    target_col = "RETURNS" if "RETURNS" in reg_data.columns else reg_data.columns[0]

    feature_cols = []
    for frame, frame_lags in zip(data, lags):
        col_name = frame.columns[0]
        for k in frame_lags:
            shifted = frame[col_name].shift(k)
            for p in range(1, power + 1):
                name = "{}_LAG_{}_{}".format(k, col_name, p)
                # Column assignment aligns the series to the response index.
                reg_data[name] = shifted ** p
                feature_cols.append(name)

    reg_data = reg_data.dropna()
    if reg_data.empty:
        raise ValueError(
            "No complete rows remain after lagging and alignment; check that "
            "the predictors and the response share index labels and overlap."
        )
    return reg_data, target_col, feature_cols


def _format_governing_equation(coefs, terms):
    """Render coefficients and '<lag>_LAG_<name>_<power>' terms as LaTeX."""
    pieces = []
    for idx, (coef, term) in enumerate(zip(coefs, terms)):
        lag, rest = term.split("_LAG_", 1)
        val, term_power = rest.rsplit("_", 1)
        # Underscores inside a variable name would start a LaTeX subscript.
        val = val.replace("_", r"\_")
        # Negative coefficients carry their own sign; others need an explicit
        # '+' so that neighbouring terms do not run together.
        sign = "+ " if idx != 0 and coef >= 0 else ""
        pieces.append(
            r"{}{:.2e} \cdot {}_{{Lag: {} }}^{{{}}}".format(
                sign, coef, val, lag, term_power
            )
        )
    return "".join(pieces)


def governing_eq(data, response, lags, power = 5, alphas= 20):
    """Fit a sparse polynomial lag model for ``response`` and display it.

    Each frame in ``data`` contributes its first column, shifted by every lag
    in the matching entry of ``lags`` and raised to the powers 1..``power``.
    A Lasso fit on standardized data selects terms for each alpha; an ordinary
    least-squares fit on the selected (unstandardized) terms is then scored,
    and the alpha with the highest R^2 is kept. Note that the R^2 is computed
    on the same rows used for fitting.

    Parameters
    ----------
    data : sequence of DataFrame
        Predictor frames; only the first column of each is used.
    response : DataFrame or Series
        Target. The "RETURNS" column is used when present, otherwise the
        first column.
    lags : sequence of iterables of int
        One collection of row shifts per entry of ``data``.
    power : int
        Highest power of each lagged predictor to include.
    alphas : int or array-like
        An int is the number of Lasso penalties on a geometric grid from 1e-2
        to 1e-1; an array-like is used directly as the penalties to try.

    Returns
    -------
    (coefficients, terms)
        Least-squares coefficients and the matching feature names of the best
        model; both are empty when no alpha selects a term with positive R^2.

    Raises
    ------
    ValueError
        If ``data`` and ``lags`` differ in length, or no complete rows remain
        after lagging and alignment.
    """
    if np.ndim(alphas) == 0:
        alpha_grid = np.geomspace(1e-2, 1e-1, int(alphas))
    else:
        alpha_grid = np.asarray(alphas, dtype=float)

    reg_data, target_col, feature_cols = _build_lagged_design(
        data, response, lags, power
    )
    X = reg_data[feature_cols]
    Y = reg_data[target_col]

    # Lasso runs on z-scored data so one penalty applies evenly to all terms.
    X_norm = (X - X.mean()) / X.std()
    Y_norm = (Y - Y.mean()) / Y.std()

    max_score = 0
    opt_terms = pd.Index([])
    opt_coef = np.array([])

    for alpha in alpha_grid:
        lassoreg = Lasso(alpha=alpha)
        lassoreg.fit(X_norm, Y_norm)
        mask = lassoreg.coef_ != 0
        if not mask.any():
            continue
        # Refit without the penalty on the raw selected columns so that the
        # reported coefficients are in the original units.
        lin_X = X.loc[:, mask]
        linreg = LinearRegression().fit(lin_X, Y)
        score = linreg.score(lin_X, Y)
        if score > max_score:
            opt_terms = lin_X.columns
            opt_coef = linreg.coef_
            max_score = score

    print("Max Score: {}".format(max_score))

    output = _format_governing_equation(opt_coef, opt_terms)

    IPython.display.display(Latex(f"""\\begin{{equation*}}
    {output}
    \\end{{equation*}}
    """))

    return opt_coef, opt_terms

## Parameters

In [None]:

# One lag set per predictor frame, in the same order. `stream` is left out of
# the predictors: there is no lag set for it, and governing_eq indexes `lags`
# by position in `data`, so a third frame would run past the end of `lags`.
data = [vix, metaculus]
met_lags = np.arange(10, 15)
vix_lags = np.arange(25, 30)

lags = [vix_lags, met_lags]

In [13]:
# Fit the daily VIX change on the lagged predictors; the returned
# (coefficients, terms) pair is shown as the cell output.
governing_eq(data=data, response=returns, lags=lags, power=5)

Max Score: 0.019371392928780984


<IPython.core.display.Latex object>

(array([-2.94728903e-07, -6.62178650e-06,  1.26647977e-07, -6.53453242e-06,
        -1.84395004e-09,  7.59801288e-07,  4.16038316e-08, -3.41491395e-04,
         8.68063878e-16, -4.91279109e-20, -1.37030742e-04,  6.07954412e-08]),
 Index(['25_LAG_VIX_1', '26_LAG_VIX_3', '26_LAG_VIX_4', '27_LAG_VIX_2',
        '28_LAG_VIX_5', '29_LAG_VIX_3', '29_LAG_VIX_4', '10_LAG_META_1',
        '11_LAG_META_4', '12_LAG_META_5', '13_LAG_META_1', '13_LAG_META_2'],
       dtype='object'))

In [15]:

# Inline variant of the governing-equation fit: the idealized stream's CLOSE
# is the response, lagged powers of VIX and Metaculus are the predictors.
data = [vix, metaculus]
met_lags = np.arange(10, 15)
vix_lags = np.arange(25, 30)

lags = [vix_lags, met_lags]

# Feature columns are attached to the response below, which requires a
# DataFrame; promote `stream` when it is a bare Series.
response = stream.to_frame() if isinstance(stream, pd.Series) else stream

power = 5
n_alphas = 20
alphas = np.geomspace(1e-2, 1e-1, n_alphas)

reg_data = response.copy()

for frame, frame_lags in zip(data, lags):
    col_name = frame.columns[0]
    for k in frame_lags:
        shifted = frame[col_name].shift(k)
        for j in range(1, power + 1):
            # Column assignment aligns the series to the response index.
            reg_data[str(k) + "_LAG_" + col_name + "_" + str(j)] = shifted ** j

reg_data = reg_data.dropna()

# First column is the response; everything after it is a generated feature.
X = reg_data.iloc[:, 1:]
Y = reg_data["CLOSE"]

# Lasso runs on z-scored data so one penalty applies evenly to all terms.
X_norm = (X - X.mean()) / X.std()
Y_norm = (Y - Y.mean()) / Y.std()

max_score = 0
opt_alpha = 0
opt_terms = []
opt_coef = {}

for alpha in alphas:
    lassoreg = Lasso(alpha=alpha)
    lassoreg.fit(X_norm, Y_norm)
    coefs = lassoreg.coef_
    mask = coefs != 0
    lin_X = X.loc[:, mask]
    if not lin_X.empty:
        # Refit without the penalty on the raw selected columns; the score is
        # R^2 on the same rows used for fitting.
        linreg = LinearRegression().fit(lin_X, Y)
        score = linreg.score(lin_X, Y)
        if score > max_score:
            opt_alpha = alpha
            opt_terms = lin_X.columns
            opt_coef = linreg.coef_
            max_score = score

print("Max Score: {}".format(max_score))

output = ""
pat = r'.*?_(.*)_.*'  # not used in this cell; kept in case other cells read it

for idx, term in enumerate(opt_terms):
    lag, rest = term.split("_LAG_", 1)
    # A separate name keeps the `power` setting above from being overwritten.
    val, term_power = rest.rsplit("_", 1)

    if idx != 0 and opt_coef[idx] >= 0:
        output += "+ "
    output += r"{:.2e} \cdot {}_{{Lag: {} }}^{{{}}}".format(opt_coef[idx], val, lag, term_power)

IPython.display.display(Latex(f"""\\begin{{equation*}}
{output}
\\end{{equation*}}
"""))

Max Score: 0.11165862294570572


<IPython.core.display.Latex object>