In [1]:
import matplotlib.pyplot as plt   
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from tqdm import tqdm
import warnings

#ignore by message
warnings.filterwarnings("ignore", message="Maximum number of iterations")
warnings.filterwarnings("ignore", message="divide by zero encountered")
warnings.filterwarnings("ignore", message="invalid value encountered in matmul")

In [2]:
sp500_hist = pd.read_csv(
    "./data/sp500_hist_prices.csv",
    parse_dates=['Date'],
    index_col='Date'
)

sp500_hist = sp500_hist.loc['2014-01-01':,]
sp500_hist.dropna(axis=1, inplace=True)


print(f'Shape: {sp500_hist.shape}')
sp500_hist.head()

Shape: (2266, 470)


Unnamed: 0_level_0,A,AAL,AAP,AAPL,ABBV,ABC,ABT,ACGL,ACN,ADBE,...,WYNN,XEL,XOM,XRAY,XYL,YUM,ZBH,ZBRA,ZION,ZTS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-01-02,37.141602,23.907921,101.185295,17.388981,35.240082,60.504734,31.847378,19.123333,68.658348,59.290001,...,169.137527,20.609207,66.582367,44.643036,30.301586,45.261662,83.170258,53.18,24.822664,30.329689
2014-01-03,37.610752,25.020359,104.080498,17.007023,35.457024,60.548027,32.188923,18.756666,68.886833,59.16,...,167.479828,20.601709,66.422142,44.857124,30.576576,45.544949,83.530914,53.580002,24.998468,30.039131
2014-01-06,37.425739,25.482304,103.084709,17.09976,34.16214,60.331593,32.613773,18.58,68.159042,58.119999,...,167.360214,20.489336,66.522278,44.587177,30.52335,45.508789,84.071892,53.400002,24.822664,29.973518
2014-01-07,37.960957,25.369169,104.357124,16.977465,34.229923,60.989532,32.363853,18.586666,68.988396,58.970001,...,172.188049,20.841444,67.463448,45.276005,30.612062,46.147724,85.749016,53.950001,24.898008,30.085989
2014-01-08,38.582062,26.047947,103.545738,17.08498,34.141792,61.586887,32.655426,18.476667,69.521561,58.900002,...,175.417999,20.759035,67.243179,45.359783,30.594316,46.129646,87.849907,53.91,25.115677,29.748577


In [3]:
TRAIN_START_DATE, TRAIN_END_DATE = "2014-01-01", "2019-01-01"
VAL_START_DATE, VAL_END_DATE = TRAIN_END_DATE, "2021-01-01"
TEST_START_DATE, TEST_END_DATE = VAL_END_DATE, "2022-12-30"

df_train = sp500_hist.loc[TRAIN_START_DATE:TRAIN_END_DATE,:]
df_val = sp500_hist.loc[VAL_START_DATE:VAL_END_DATE,:]
df_test = sp500_hist.loc[TEST_START_DATE:TEST_END_DATE,:]

print(f"Train shape: {df_train.shape}")
print(f"Validation shape: {df_val.shape}")
print(f"Test shape: {df_test.shape}")

assert df_train.shape[1] == df_val.shape[1] and df_val.shape[1] == df_test.shape[1]


Train shape: (1258, 470)
Validation shape: (505, 470)
Test shape: (503, 470)


In [4]:
# df_train

In [5]:
assets = df_train.columns.to_list()
n_assets = len(assets)
q = 0.1
df_corr = pd.DataFrame(
    data=np.zeros(shape=(n_assets, n_assets)),
    index=assets,
    columns=assets)
parameters = []

for i in tqdm(assets):
    for j in assets:
        # beta_1.2
        mod_12 = smf.quantreg(f'{j} ~ {i}', df_train) 
        res_12 = mod_12.fit(q=q) 
        beta_12 = res_12.params.loc[i]
        
        # beta_2.1
        mod_21 = smf.quantreg(f'{i} ~ {j}', df_train) 
        res_21 = mod_21.fit(q=q) 
        beta_21 = res_21.params.loc[j]

        # tau quantile correlation
        rho = np.sign(beta_21) * np.sqrt(beta_21 * beta_12) if beta_21 * beta_12 >= 0 else 0
        
        # save values
        parameters.append((i, j, beta_12, beta_21, np.sign(beta_21), rho))
        df_corr.at[i, j] = rho

100%|██████████| 470/470 [2:12:29<00:00, 16.91s/it]  


In [6]:
# Save correlation matrix
df_corr.round(10).to_csv(f"./data/correlation_matrix_q{str(q).replace('.', '')}.csv")

In [8]:
df_corr.isnull().any().any()

False

In [9]:
df_corr.max().max()

1.3062638779529272

In [10]:
df_corr.min().min()

-0.938849710837586

In [12]:
df_corr.mean().mean()

0.4108686513539695

In [11]:
df_corr

Unnamed: 0,A,AAL,AAP,AAPL,ABBV,ABC,ABT,ACGL,ACN,ADBE,...,WYNN,XEL,XOM,XRAY,XYL,YUM,ZBH,ZBRA,ZION,ZTS
A,1.000000,0.126031,-0.245360,0.882155,0.800842,0.000000,0.782772,0.686253,0.962686,0.761621,...,0.162596,0.879218,0.162189,0.000000,0.883745,0.922799,0.455253,0.519161,0.836153,0.887999
AAL,0.126031,1.000000,-0.110208,0.299902,0.259007,0.548440,0.172539,0.284371,0.141369,0.103587,...,0.064629,0.128738,0.000000,0.293382,0.076207,0.128113,0.211092,0.248765,0.111779,0.219982
AAP,-0.245360,-0.110208,1.000000,0.000000,0.000000,0.529688,0.000000,0.000000,0.000000,0.000000,...,-0.651962,0.000000,0.000000,-0.450540,-0.186514,0.000000,0.000000,-0.281047,-0.366653,0.000000
AAPL,0.882155,0.299902,0.000000,1.000000,0.667970,0.416025,0.783243,0.622700,0.888316,0.990407,...,0.000000,0.642576,0.000000,0.000000,0.895250,0.909737,0.425223,0.847059,0.893100,0.871727
ABBV,0.800842,0.259007,0.000000,0.667970,1.000000,0.302055,0.692231,0.473762,0.813505,0.744662,...,0.000000,0.556391,0.000000,0.000000,0.868733,0.746978,0.422058,0.644879,0.780229,0.764553
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YUM,0.922799,0.128113,0.000000,0.909737,0.746978,0.256988,0.904385,0.563274,0.979810,0.849378,...,0.000000,0.891992,0.000000,0.000000,0.886790,1.000000,0.480312,0.885237,0.831561,0.977130
ZBH,0.455253,0.211092,0.000000,0.425223,0.422058,0.379061,0.320668,0.496664,0.500989,0.311979,...,0.000000,0.556899,0.000000,0.000000,0.376052,0.480312,1.000000,0.228439,0.326525,0.484123
ZBRA,0.519161,0.248765,-0.281047,0.847059,0.644879,0.417030,0.923303,0.552885,0.779237,0.531213,...,0.172692,0.383697,0.000000,-0.232785,0.399998,0.885237,0.228439,1.000000,0.632881,1.028497
ZION,0.836153,0.111779,-0.366653,0.893100,0.780229,0.058331,0.632676,0.730576,1.020146,0.734847,...,0.265824,0.515079,0.209662,-0.261024,0.909633,0.831561,0.326525,0.632881,1.000000,0.920427
