In [1]:
import matplotlib.pyplot as plt   
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from tqdm import tqdm
import warnings

#ignore by message
warnings.filterwarnings("ignore", message="Maximum number of iterations")
warnings.filterwarnings("ignore", message="divide by zero encountered")
warnings.filterwarnings("ignore", message="invalid value encountered in matmul")

In [2]:
# Load the price history indexed by date, keep rows from 2014-01-01 onwards,
# then drop every column that has at least one missing value in that window.
# Built as a single chain so the frame is never mutated in place.
sp500_hist = (
    pd.read_csv(
        "./data/sp500_hist_prices.csv",
        parse_dates=['Date'],
        index_col='Date',
    )
    .loc['2014-01-01':, :]
    .dropna(axis=1)
)

print(f'Shape: {sp500_hist.shape}')
sp500_hist.head()

Shape: (2266, 470)


Unnamed: 0_level_0,A,AAL,AAP,AAPL,ABBV,ABC,ABT,ACGL,ACN,ADBE,...,WYNN,XEL,XOM,XRAY,XYL,YUM,ZBH,ZBRA,ZION,ZTS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-01-02,37.141602,23.907921,101.185295,17.388981,35.240082,60.504734,31.847378,19.123333,68.658348,59.290001,...,169.137527,20.609207,66.582367,44.643036,30.301586,45.261662,83.170258,53.18,24.822664,30.329689
2014-01-03,37.610752,25.020359,104.080498,17.007023,35.457024,60.548027,32.188923,18.756666,68.886833,59.16,...,167.479828,20.601709,66.422142,44.857124,30.576576,45.544949,83.530914,53.580002,24.998468,30.039131
2014-01-06,37.425739,25.482304,103.084709,17.09976,34.16214,60.331593,32.613773,18.58,68.159042,58.119999,...,167.360214,20.489336,66.522278,44.587177,30.52335,45.508789,84.071892,53.400002,24.822664,29.973518
2014-01-07,37.960957,25.369169,104.357124,16.977465,34.229923,60.989532,32.363853,18.586666,68.988396,58.970001,...,172.188049,20.841444,67.463448,45.276005,30.612062,46.147724,85.749016,53.950001,24.898008,30.085989
2014-01-08,38.582062,26.047947,103.545738,17.08498,34.141792,61.586887,32.655426,18.476667,69.521561,58.900002,...,175.417999,20.759035,67.243179,45.359783,30.594316,46.129646,87.849907,53.91,25.115677,29.748577


In [3]:
# Split boundaries. Neighbouring splits share a boundary date:
# VAL_START_DATE is TRAIN_END_DATE and TEST_START_DATE is VAL_END_DATE.
TRAIN_START_DATE, TRAIN_END_DATE = "2014-01-01", "2019-01-01"
VAL_START_DATE, VAL_END_DATE = TRAIN_END_DATE, "2021-01-01"
TEST_START_DATE, TEST_END_DATE = VAL_END_DATE, "2022-12-30"


def _rows_between(frame, start, end):
    """Return the rows of `frame` whose index date lies in the half-open range [start, end)."""
    idx = frame.index
    return frame.loc[(idx >= pd.Timestamp(start)) & (idx < pd.Timestamp(end)), :]


# `.loc[start:end]` label slicing includes BOTH endpoints, so slicing every
# split that way would put a row dated exactly on a shared boundary into two
# splits (leakage). Train and validation are therefore half-open; only the
# final split keeps its inclusive end date.
df_train = _rows_between(sp500_hist, TRAIN_START_DATE, TRAIN_END_DATE)
df_val = _rows_between(sp500_hist, VAL_START_DATE, VAL_END_DATE)
df_test = sp500_hist.loc[TEST_START_DATE:TEST_END_DATE, :]

print(f"Train shape: {df_train.shape}")
print(f"Validation shape: {df_val.shape}")
print(f"Test shape: {df_test.shape}")

# All splits must expose the same set of columns.
assert df_train.shape[1] == df_val.shape[1] and df_val.shape[1] == df_test.shape[1]
# No date may appear in two neighbouring splits.
assert df_train.index.intersection(df_val.index).empty, "train/validation overlap"
assert df_val.index.intersection(df_test.index).empty, "validation/test overlap"


Train shape: (1258, 470)
Validation shape: (505, 470)
Test shape: (503, 470)


In [4]:
# Show the training split as the cell output (large frames are rendered truncated).
df_train

Unnamed: 0_level_0,A,AAL,AAP,AAPL,ABBV,ABC,ABT,ACGL,ACN,ADBE,...,WYNN,XEL,XOM,XRAY,XYL,YUM,ZBH,ZBRA,ZION,ZTS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-01-02,37.141602,23.907921,101.185295,17.388981,35.240082,60.504734,31.847378,19.123333,68.658348,59.290001,...,169.137527,20.609207,66.582367,44.643036,30.301586,45.261662,83.170258,53.180000,24.822664,30.329689
2014-01-03,37.610752,25.020359,104.080498,17.007023,35.457024,60.548027,32.188923,18.756666,68.886833,59.160000,...,167.479828,20.601709,66.422142,44.857124,30.576576,45.544949,83.530914,53.580002,24.998468,30.039131
2014-01-06,37.425739,25.482304,103.084709,17.099760,34.162140,60.331593,32.613773,18.580000,68.159042,58.119999,...,167.360214,20.489336,66.522278,44.587177,30.523350,45.508789,84.071892,53.400002,24.822664,29.973518
2014-01-07,37.960957,25.369169,104.357124,16.977465,34.229923,60.989532,32.363853,18.586666,68.988396,58.970001,...,172.188049,20.841444,67.463448,45.276005,30.612062,46.147724,85.749016,53.950001,24.898008,30.085989
2014-01-08,38.582062,26.047947,103.545738,17.084980,34.141792,61.586887,32.655426,18.476667,69.521561,58.900002,...,175.417999,20.759035,67.243179,45.359783,30.594316,46.129646,87.849907,53.910000,25.115677,29.748577
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-24,60.658115,29.247074,138.501755,35.375175,68.045486,66.140617,60.909603,24.799999,125.596710,205.160004,...,88.352982,42.943733,52.057148,33.934464,58.313515,80.661499,91.214142,142.899994,33.992252,77.151306
2018-12-26,63.435974,31.776182,144.184326,37.866352,71.991089,68.271782,64.681641,25.920000,130.614197,222.949997,...,95.286011,43.792011,54.544388,35.590034,61.059139,83.414490,95.707779,152.520004,35.757629,80.693588
2018-12-27,64.345779,31.530161,143.868149,37.620605,72.694519,68.729797,65.619980,26.540001,131.929657,225.139999,...,94.220131,44.165985,54.782780,35.607304,62.085175,84.313622,95.641960,155.350006,35.713490,82.065712
2018-12-28,64.000374,31.323500,144.584259,37.639877,73.672836,69.131714,66.047348,26.389999,131.375290,223.130005,...,93.375107,43.854332,54.170898,35.588116,62.037682,84.916122,96.525887,155.970001,35.687012,82.221420


In [5]:
assets = df_train.columns.to_list()
n_assets = len(assets)
q = 0.2  # quantile level passed to every quantile-regression fit below
df_corr = pd.DataFrame(
    data=np.zeros(shape=(n_assets, n_assets)),
    index=assets,
    columns=assets)
parameters = []

# Fitted slopes keyed by (response, regressor). The pair loop needs the slope
# of `j ~ i` in iteration (i, j) and again in iteration (j, i); caching it
# means each ordered regression is fitted once instead of twice.
_slope_cache = {}


def _quantile_slope(response, regressor):
    """Return the slope of the q-quantile regression `response ~ regressor` on df_train.

    Parameters
    ----------
    response, regressor : str
        Column names of `df_train`. They are interpolated verbatim into the
        model formula.
        NOTE(review): this presumably requires names that are valid formula
        identifiers (a name containing '-' or '.' would be parsed as an
        expression) -- confirm against the actual column set.

    Returns
    -------
    The coefficient on `regressor`, as read from the fitted `params`.
    """
    key = (response, regressor)
    if key not in _slope_cache:
        fitted = smf.quantreg(f'{response} ~ {regressor}', df_train).fit(q=q)
        _slope_cache[key] = fitted.params.loc[regressor]
    return _slope_cache[key]


for i in tqdm(assets):
    for j in assets:
        # beta_1.2: slope of j regressed on i
        beta_12 = _quantile_slope(j, i)

        # beta_2.1: slope of i regressed on j
        beta_21 = _quantile_slope(i, j)

        # tau quantile correlation: signed geometric mean of the two slopes.
        # If the slopes disagree in sign the product is negative and rho is
        # set to 0; a NaN slope also fails the `>= 0` test and yields 0.
        rho = np.sign(beta_21) * np.sqrt(beta_21 * beta_12) if beta_21 * beta_12 >= 0 else 0

        # save values
        parameters.append((i, j, beta_12, beta_21, np.sign(beta_21), rho))
        df_corr.at[i, j] = rho
        

  d = np.where(e > 0, (q/fhat0)**2, ((1-q)/fhat0)**2)
  vcov = xtxi @ xtdx @ xtxi
  lfit.sparsity = 1. / fhat0
  d = np.where(e > 0, (q/fhat0)**2, ((1-q)/fhat0)**2)
  vcov = xtxi @ xtdx @ xtxi
  lfit.sparsity = 1. / fhat0
  d = np.where(e > 0, (q/fhat0)**2, ((1-q)/fhat0)**2)
  vcov = xtxi @ xtdx @ xtxi
  lfit.sparsity = 1. / fhat0
  d = np.where(e > 0, (q/fhat0)**2, ((1-q)/fhat0)**2)
  vcov = xtxi @ xtdx @ xtxi
  lfit.sparsity = 1. / fhat0
  d = np.where(e > 0, (q/fhat0)**2, ((1-q)/fhat0)**2)
  vcov = xtxi @ xtdx @ xtxi
  lfit.sparsity = 1. / fhat0
  d = np.where(e > 0, (q/fhat0)**2, ((1-q)/fhat0)**2)
  vcov = xtxi @ xtdx @ xtxi
  lfit.sparsity = 1. / fhat0
  d = np.where(e > 0, (q/fhat0)**2, ((1-q)/fhat0)**2)
  vcov = xtxi @ xtdx @ xtxi
  lfit.sparsity = 1. / fhat0
  d = np.where(e > 0, (q/fhat0)**2, ((1-q)/fhat0)**2)
  vcov = xtxi @ xtdx @ xtxi
  lfit.sparsity = 1. / fhat0
  d = np.where(e > 0, (q/fhat0)**2, ((1-q)/fhat0)**2)
  vcov = xtxi @ xtdx @ xtxi
  lfit.sparsity = 1. / fhat0
 

In [14]:
# Persist the correlation matrix, rounded to 10 decimals. The quantile is
# embedded in the file name with its decimal point removed
# (e.g. q = 0.2 -> "correlation_matrix_q02.csv").
quantile_tag = str(q).replace('.', '')
corr_csv_path = f"./data/correlation_matrix_q{quantile_tag}.csv"
df_corr.round(10).to_csv(corr_csv_path)