In [1]:
import os, sys
import pandas as pd
import numpy as np
import glob
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.dataset as ds
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import itertools
from tqdm.auto import tqdm
from multiprocessing import Pool
from stargazer.stargazer import Stargazer
from adjustText import adjust_text
import ipywidgets as widgets

# Use seaborn's "paper" plotting context with fonts scaled by 1.7 for every figure below
sns.set_context("paper", font_scale=1.7)

# Load Data

In [2]:
## Stock info
# Key file for the stock universe; read once into `info_df`
stock_universe_path = '../../../HFZoo/data/keys/stock_universe.feather'
info_df = pd.read_feather(stock_universe_path)

In [3]:
%%time

# Directory that holds the cleaned price files (one or more parquet files)
data_folder = '../../data/proc/clean_prices/'

# Gather every parquet file in that directory and hand the folder's `_metadata`
# file to the dataset alongside the file list
parquet_paths = glob.glob(data_folder + '*.parquet')
dataset_metadata = pq.read_metadata(data_folder + '_metadata')
filter_ds = pq.ParquetDataset(parquet_paths, metadata = dataset_metadata)

# Materialise the full dataset as one pandas DataFrame
rawdata_df = filter_ds.read_pandas().to_pandas()

CPU times: user 6.11 s, sys: 8.48 s, total: 14.6 s
Wall time: 31.6 s


## Clean

In [4]:
%%time

# Work on a sorted copy of the raw data so `rawdata_df` itself is left untouched
data_df = rawdata_df.sort_values(by = ['date', 'permno', 'datetime']).copy()
data_df['datetime'] = pd.to_datetime(data_df['datetime'])

# Ticker = symbol rendered as text with every literal '.' removed
symbol_text = data_df['symbol'].astype(str)
data_df['ticker'] = symbol_text.str.replace('.', '', regex=False)

# Keep the log of the merged return (the one that carries the CRSP overnight
# return) under `return_merge` ...
data_df = data_df.rename(columns={"return": "return_merge"})
data_df['return_merge'] = np.log(1 + data_df['return_merge'])

# ... and make `return` the intraday-only series by blanking the observation
# stamped exactly 09:30:00
is_open_stamp = data_df["datetime"].dt.time.astype(str) == "09:30:00"
data_df["return"] = np.where(is_open_stamp, np.nan, data_df['return_merge'])

CPU times: user 11.3 s, sys: 4.58 s, total: 15.9 s
Wall time: 15.8 s


In [5]:
%%time

## Deal with missing data for holidays
# Identify half-days from SPY: a date is flagged when SPY's intraday return is
# identical to the previous observation's return (retdiff == 0) at least 20 times
temp_df = data_df.query('symbol in ("SPY", "SPY.")').copy()
temp_df["retdiff"] = temp_df["return"].diff(1)
half_days = (
    temp_df.query("retdiff==0")
    .groupby(["date"])["retdiff"]
    .count()
    .sort_values()
    .reset_index()
    .query("retdiff >= 20")["date"]
)

# Blank the intraday return on those dates for every stock
# (`return_merge` is deliberately left as it is)
data_df["return"] = np.where(
    data_df["date"].isin(half_days),
    np.nan,
    data_df["return"],
)

# Add time aggregator: number the sorted unique dates and bucket them into
# blocks of 5 trading days ("weeks") and 21 trading days ("months")
time_agg_df = pd.DataFrame(np.sort(data_df['date'].unique()), columns = ['date']).reset_index()
time_agg_df['trading_week_id'] = np.floor(time_agg_df['index'] / 5).astype(int)
time_agg_df['trading_month_id'] = np.floor(time_agg_df['index'] / 21).astype(int)

# Stamp every date with the last date of its own bucket. `transform('last')`
# returns a result aligned to the rows of `time_agg_df`; assigning the plain
# `.last()` aggregate would align on the bucket id instead of the row index and
# put bucket i's date on row i (and NaN on all later rows).
time_agg_df['trading_week_datetime'] = (
    time_agg_df.groupby(['trading_week_id'])['date'].transform('last')
)
time_agg_df['trading_month_datetime'] = (
    time_agg_df.groupby(['trading_month_id'])['date'].transform('last')
)
data_df = data_df.merge(time_agg_df.drop(['index'], axis = 1), on = ['date'], how = 'left')

CPU times: user 1.64 s, sys: 2.78 s, total: 4.42 s
Wall time: 4.45 s


In [6]:
%%time
# Save ETF results
etf_tickers = ["SPY", "SPYV", "VFVA", "VFMO", "QUAL", "MTUM", "SIZE", "IWM"]

# Latest info record per PERMNO among the stocks carrying one of those tickers
etf_info_df = (
    info_df.query("ticker in @etf_tickers")
    .groupby(["permno"])[["ticker", "dt", "shrcd", "exchcd"]]
    .last()
)

# Keep share code 73 only (presumably the ETF share code -- confirm), then
# express the PERMNOs as categorical strings for the filter below
etf_permnos = (
    etf_info_df.query("shrcd == 73").index
    .astype(int)
    .astype(str)
    .astype('category')
)

# Write the intraday rows of those PERMNOs with a fresh default index
etf_prices_df = data_df.query('permno in @etf_permnos').reset_index(drop = True)
etf_prices_df.to_feather('../../data/temp/etf_prices.feather')

CPU times: user 1.02 s, sys: 1.3 s, total: 2.31 s
Wall time: 2.2 s


## Set up Covid Clusters

In [7]:
# Cluster by ticker (PERMNO lists; the trailing comment on each entry is its ticker)
cluster_bad_permnos = [
    19561, # "BA"
    34833, # "OXY"
    17830, # "RTX"
    14277, # "SLB"
    80100, # "SPG"
]
cluster_good_permnos = [
    76841, # "BIIB"
    86783, # "BKNG"
    12308, # "CHTR"
    77274, # "GILD"
]
cluster_home_permnos = [
    84788, # "AMZN"
    89393, # "NFLX"
]
other_permnos = [
    13407, # "FB"
    84788, # "AMZN"
    14593, # "AAPL"
    89393, # "NFLX"
    14542, # "GOOG"
]

# Create dataframe of cluster classifications: one row per distinct PERMNO,
# stored as a string
cluster_class_df = pd.DataFrame(
    cluster_bad_permnos + cluster_good_permnos + cluster_home_permnos + other_permnos, columns=["permno"]
).drop_duplicates().astype({'permno': 'str'})

# Attach the last ticker / symbol seen in the price data for each PERMNO
# (left join: a PERMNO with no price rows keeps a missing ticker)
cluster_class_df = cluster_class_df.merge(
    data_df.groupby(["permno"])[["ticker", "symbol"]].last().reset_index(),
    on=["permno"],
    how="left",
)

# Add cluster types. np.select takes the first matching condition, so a PERMNO
# listed in several groups is classified in the order bad -> home -> good;
# PERMNOs in none of the three lists get None.
permno_as_int = cluster_class_df["permno"].astype(int)
cluster_class_df["class"] = np.select(
    [
        permno_as_int.isin(cluster_bad_permnos),
        permno_as_int.isin(cluster_home_permnos),
        permno_as_int.isin(cluster_good_permnos),
    ],
    ["bad", "home", "good"],
    default=None,
)

# FAANG flag from the ticker text. na=False makes a missing ticker count as
# "not FAANG"; without it str.contains yields NaN, which np.where treats as truthy.
cluster_class_df['is_faang'] = np.where(
    cluster_class_df['ticker'].str.contains('FB|AAPL|AMZN|NFLX|GOOG', na=False), 1, 0
)

# Display only: show the table without the raw symbol column (nothing is reassigned)
cluster_class_df.drop(['symbol'], axis= 1)

Unnamed: 0,permno,ticker,class,is_faang
0,19561,BA,bad,0
1,34833,OXY,bad,0
2,17830,RTX,bad,0
3,14277,SLB,bad,0
4,80100,SPG,bad,0
5,76841,BIIB,good,0
6,86783,BKNG,good,0
7,12308,CHTR,good,0
8,77274,GILD,good,0
9,84788,AMZN,home,1


## Create portfolios

In [8]:
# Portfolios: tag every intraday row with its cluster class / FAANG flag
# (rows of stocks outside the cluster table get missing values for both)
return_cols = ["return", "return_merge"]
temp_df = data_df.merge(
    cluster_class_df[["permno", "class", "is_faang"]], on=["permno"], how="left"
)

# COVID: equal-weighted (simple mean) return per class and timestamp; the
# title-cased class name doubles as both ticker and permno of the portfolio
covid_port_df = temp_df.groupby(["class", "datetime"])[return_cols].mean().reset_index()
covid_port_df = covid_port_df.rename(columns={"class": "ticker"})
covid_port_df["ticker"] = covid_port_df["ticker"].str.title()
covid_port_df["permno"] = covid_port_df["ticker"].str.title()

# FAANG: same averaging by flag, keeping only the flagged (is_faang == 1) group
faang_port_df = temp_df.groupby(["is_faang", "datetime"])[return_cols].mean().reset_index()
faang_port_df = faang_port_df.query("is_faang == 1")[["datetime"] + return_cols].copy()
faang_port_df["ticker"] = "FAANG"
faang_port_df["permno"] = "FAANG"

# SPY: taken directly from the price data via its PERMNO
spy_df = temp_df.query('permno == "84398"')[["datetime", "permno"] + return_cols].copy()
spy_df["ticker"] = "SPY"

# Stack the three sets of series into one long frame
port_df = pd.concat([covid_port_df, faang_port_df, spy_df], axis=0)

## Save Results

In [9]:
# Persist the intraday stock panel and the portfolio return series; the index is
# reset (and dropped) first so each frame is written with a default integer index
data_df.reset_index(drop = True).to_feather('../../data/proc/all_hf.feather')
port_df.reset_index(drop = True).to_feather('../../data/proc/port_hf.feather')
# Cluster classification table, written with its current index
cluster_class_df.to_feather('../../data/proc/cluster_classifications.feather')

# Process Data

## Compute Volatility Measures

#### Main

In [10]:
%%time
# Add intermediate columns to speed up calculations: the positive and negative
# parts of the intraday return and the squares of all three
intraday_ret = data_df['return']
data_df['return_pos'] = intraday_ret*(intraday_ret > 0)
data_df['return_neg'] = intraday_ret*(intraday_ret < 0)
data_df['return_sq'] = np.square(intraday_ret)
data_df['return_pos_sq'] = np.square(data_df['return_pos'])
data_df['return_neg_sq'] = np.square(data_df['return_neg'])

# Get daily results: sum within daily bins labelled by their right edge.
# NOTE(review): sum() skips NaN, so a day whose intraday returns were all blanked
# (the half-days) gets a realized variance of 0 rather than NaN -- confirm intended.
daily_bins = pd.Grouper(key = 'datetime', freq = '1d', label = 'right')
daily_data_df = (
    data_df.groupby(['permno', daily_bins], observed = True)
    [['return_sq', 'return_pos_sq', 'return_neg_sq', 'return_merge']]
    .sum()
    .reset_index()
    .rename(columns = {
        'return_sq': 'real_var',
        'return_pos_sq': 'real_var_pos',
        'return_neg_sq': 'real_var_neg',
        'return_merge': 'log_return',
    })
)

# Add datetime info: calendar date of the bin label plus the trading week/month ids
daily_data_df['date'] = pd.to_datetime(daily_data_df['datetime'].dt.date)
daily_data_df = daily_data_df.merge(time_agg_df.drop(columns = ['index']), on = ['date'], how = 'left')

# Add weekly and monthly RV / returns: per-stock rolling sums that need a full
# window (5 and 21 daily rows respectively) before producing a value
daily_data_df = daily_data_df.sort_values(by = ['permno', 'datetime'])
rolling_specs = {
    'real_var_weekly': ('real_var', 5),
    'real_var_monthly': ('real_var', 21),
    'log_return_weekly': ('log_return', 5),
    'log_return_monthly': ('log_return', 21),
}
for rolled_col, (source_col, window) in rolling_specs.items():
    daily_data_df[rolled_col] = daily_data_df.groupby(['permno'])[source_col].transform(
        lambda x, w = window: x.rolling(w, min_periods = w).sum())

# Add other RV based values
# Next row's value within the same stock (the regression targets)
for lead_col, source_col in {'real_var_lead': 'real_var', 'log_return_lead': 'log_return'}.items():
    daily_data_df[lead_col] = daily_data_df.groupby(['permno'])[source_col].shift(-1)
# Volatilities are the square roots of the corresponding variances
for vol_col, var_col in {'real_vol': 'real_var', 'real_vol_pos': 'real_var_pos', 'real_vol_neg': 'real_var_neg'}.items():
    daily_data_df[vol_col] = np.sqrt(daily_data_df[var_col])
# Signed jump measures: positive minus negative part, in variance and volatility units
daily_data_df['SJ'] = daily_data_df['real_var_pos'] - daily_data_df['real_var_neg']
daily_data_df['SJs'] = daily_data_df['real_vol_pos'] - daily_data_df['real_vol_neg']
daily_data_df['SJ_10'] = daily_data_df['SJ']*10

# Save SPY data for other notebook
spy_daily_out_df = daily_data_df.query('permno == "84398"').reset_index(drop = True)
spy_daily_out_df.to_feather('../../data/proc/SPY_daily.feather')

CPU times: user 2.52 s, sys: 4.47 s, total: 6.98 s
Wall time: 6.34 s


In [11]:
%%time
# Get weekly results: average the daily measures of each stock within '1w' bins
measure_cols = ['real_var', 'real_var_pos', 'real_var_neg', 'log_return']
weekly_bins = pd.Grouper(key = 'datetime', freq = '1w')
weekly_data_df = (
    daily_data_df.groupby(['permno', weekly_bins], observed = True)[measure_cols]
    .mean()
    .reset_index()
    .sort_values(by = ['permno', 'datetime'])
)

# Add other RV based values
# Next period's value within the same stock
for lead_col, source_col in {'real_var_lead': 'real_var', 'log_return_lead': 'log_return'}.items():
    weekly_data_df[lead_col] = weekly_data_df.groupby(['permno'])[source_col].shift(-1)
# Volatilities are the square roots of the corresponding variances
for vol_col, var_col in {'real_vol': 'real_var', 'real_vol_pos': 'real_var_pos', 'real_vol_neg': 'real_var_neg'}.items():
    weekly_data_df[vol_col] = np.sqrt(weekly_data_df[var_col])
# Signed jump measures in variance and volatility units, plus a 10x rescaled copy
weekly_data_df['SJ'] = weekly_data_df['real_var_pos'] - weekly_data_df['real_var_neg']
weekly_data_df['SJs'] = weekly_data_df['real_vol_pos'] - weekly_data_df['real_vol_neg']
weekly_data_df['SJ_10'] = weekly_data_df['SJ']*10

CPU times: user 31.2 ms, sys: 0 ns, total: 31.2 ms
Wall time: 40.8 ms


In [12]:
%%time
# Get monthly results: average the daily measures of each stock within '1m' bins.
# The selected columns already carry their final names, so no renaming is needed.
measure_cols = ['real_var', 'real_var_pos', 'real_var_neg', 'log_return']
monthly_bins = pd.Grouper(key = 'datetime', freq = '1m')
monthly_data_df = (
    daily_data_df.groupby(['permno', monthly_bins], observed = True)[measure_cols]
    .mean()
    .reset_index()
    .sort_values(by = ['permno', 'datetime'])
)

# Add other RV based values
# Next period's value within the same stock
for lead_col, source_col in {'real_var_lead': 'real_var', 'log_return_lead': 'log_return'}.items():
    monthly_data_df[lead_col] = monthly_data_df.groupby(['permno'])[source_col].shift(-1)
# Volatilities are the square roots of the corresponding variances
for vol_col, var_col in {'real_vol': 'real_var', 'real_vol_pos': 'real_var_pos', 'real_vol_neg': 'real_var_neg'}.items():
    monthly_data_df[vol_col] = np.sqrt(monthly_data_df[var_col])
# Signed jump measures in variance and volatility units, plus a 10x rescaled copy
monthly_data_df['SJ'] = monthly_data_df['real_var_pos'] - monthly_data_df['real_var_neg']
monthly_data_df['SJs'] = monthly_data_df['real_vol_pos'] - monthly_data_df['real_vol_neg']
monthly_data_df['SJ_10'] = monthly_data_df['SJ']*10

CPU times: user 31.2 ms, sys: 15.6 ms, total: 46.9 ms
Wall time: 28.6 ms


In [13]:
%%time
# Get hourly results: sum the squared intraday returns (and the intraday return
# itself) of each stock within hourly bins labelled by their right edge
hourly_bins = pd.Grouper(key = 'datetime', freq = '1h', label = 'right')
hourly_data_df = (
    data_df.groupby(['permno', hourly_bins], observed = True)
    [['return_sq', 'return_pos_sq', 'return_neg_sq', 'return']]
    .sum()
    .reset_index()
)
# NOTE(review): the bin labels presumably are datetimes already, which would make
# this conversion a no-op -- confirm before removing
hourly_data_df['datetime'] = pd.to_datetime(hourly_data_df['datetime'], format = '%Y%m%d')
hourly_data_df = hourly_data_df.rename(columns = {
    'return_sq': 'real_var',
    'return_pos_sq': 'real_var_pos',
    'return_neg_sq': 'real_var_neg',
    'return': 'log_return',
})
hourly_data_df = hourly_data_df.sort_values(by = ['permno', 'datetime'])

# Add other RV based values
# Next period's value within the same stock
for lead_col, source_col in {'real_var_lead': 'real_var', 'log_return_lead': 'log_return'}.items():
    hourly_data_df[lead_col] = hourly_data_df.groupby(['permno'])[source_col].shift(-1)
# Volatilities are the square roots of the corresponding variances
for vol_col, var_col in {'real_vol': 'real_var', 'real_vol_pos': 'real_var_pos', 'real_vol_neg': 'real_var_neg'}.items():
    hourly_data_df[vol_col] = np.sqrt(hourly_data_df[var_col])
# Signed jump measures in variance and volatility units, plus a 10x rescaled copy
hourly_data_df['SJ'] = hourly_data_df['real_var_pos'] - hourly_data_df['real_var_neg']
hourly_data_df['SJs'] = hourly_data_df['real_vol_pos'] - hourly_data_df['real_vol_neg']
hourly_data_df['SJ_10'] = hourly_data_df['SJ']*10

CPU times: user 2.42 s, sys: 2.14 s, total: 4.56 s
Wall time: 4.57 s


#### Portfolios

In [14]:
%%time
# Add intermediate columns to speed up calculations: the positive and negative
# parts of the portfolio intraday return and the squares of all three
port_ret = port_df['return']
port_df['return_pos'] = port_ret*(port_ret > 0)
port_df['return_neg'] = port_ret*(port_ret < 0)
port_df['return_sq'] = np.square(port_ret)
port_df['return_pos_sq'] = np.square(port_df['return_pos'])
port_df['return_neg_sq'] = np.square(port_df['return_neg'])

# Get daily results: sum within daily bins labelled by their right edge
daily_bins = pd.Grouper(key = 'datetime', freq = '1d', label = 'right')
daily_port_df = (
    port_df.groupby(['permno', daily_bins], observed = True)
    [['return_sq', 'return_pos_sq', 'return_neg_sq', 'return_merge']]
    .sum()
    .reset_index()
    .rename(columns = {
        'return_sq': 'real_var',
        'return_pos_sq': 'real_var_pos',
        'return_neg_sq': 'real_var_neg',
        'return_merge': 'log_return',
    })
)

# Add weekly and monthly RV / returns: per-portfolio rolling sums that need a
# full window before producing a value.
# NOTE(review): the monthly window here is 22 rows, while the per-stock daily
# cell uses 21 -- confirm which length is intended.
daily_port_df = daily_port_df.sort_values(by = ['permno', 'datetime'])
rolling_specs = {
    'real_var_weekly': ('real_var', 5),
    'real_var_monthly': ('real_var', 22),
    'log_return_weekly': ('log_return', 5),
    'log_return_monthly': ('log_return', 22),
}
for rolled_col, (source_col, window) in rolling_specs.items():
    daily_port_df[rolled_col] = daily_port_df.groupby(['permno'])[source_col].transform(
        lambda x, w = window: x.rolling(w, min_periods = w).sum())

# Add other RV based values
# Next row's value within the same portfolio
for lead_col, source_col in {'real_var_lead': 'real_var', 'log_return_lead': 'log_return'}.items():
    daily_port_df[lead_col] = daily_port_df.groupby(['permno'])[source_col].shift(-1)
# Volatilities are the square roots of the corresponding variances
for vol_col, var_col in {'real_vol': 'real_var', 'real_vol_pos': 'real_var_pos', 'real_vol_neg': 'real_var_neg'}.items():
    daily_port_df[vol_col] = np.sqrt(daily_port_df[var_col])
# Signed jump measures in variance and volatility units, plus a 10x rescaled copy
daily_port_df['SJ'] = daily_port_df['real_var_pos'] - daily_port_df['real_var_neg']
daily_port_df['SJs'] = daily_port_df['real_vol_pos'] - daily_port_df['real_vol_neg']
daily_port_df['SJ_10'] = daily_port_df['SJ']*10

CPU times: user 516 ms, sys: 484 ms, total: 1 s
Wall time: 951 ms


## Save results

In [15]:
# Persist the daily stock and portfolio measures. Both frames were re-ordered
# with sort_values, so reset the index before writing -- the same way every other
# feather export in this notebook does -- instead of relying on the index
# happening to still be the default 0..n-1 range.
daily_data_df.reset_index(drop = True).to_feather('../../data/proc/all_daily.feather')
daily_port_df.reset_index(drop = True).to_feather('../../data/proc/portfolios_daily.feather')

# Visualize

## SPY

In [None]:
# Daily and monthly measures of SPY, selected by its PERMNO
spy_filter = 'permno == "84398"'
spy_daily_data_df = daily_data_df.query(spy_filter)
spy_monthly_data_df = monthly_data_df.query(spy_filter)

In [None]:
# Two stacked panels: daily measures on top, monthly measures below
fig, axs = plt.subplots(figsize = (19,15), nrows = 2)

# Column to draw and the line styling that goes with it (drawn in this order)
series_styles = [
    ('real_vol', dict(color = 'k', ls = '--')),
    ('real_vol_pos', dict(color = 'b')),
    ('real_vol_neg', dict(color = 'r')),
    ('SJ_10', dict(color = 'g')),
]

for ax, panel_df in zip(axs, [spy_daily_data_df, spy_monthly_data_df]):
    for column, style in series_styles:
        panel_df.plot('datetime', column, alpha = 0.5, ax = ax, **style)

## BA

In [None]:
# Daily and monthly measures of the sample stock (PERMNO 19561)
sample_filter = 'permno == "19561"'
sample_daily_data_df = daily_data_df.query(sample_filter)
sample_monthly_data_df = monthly_data_df.query(sample_filter)

In [None]:
# Two stacked panels: daily measures on top, monthly measures below
fig, axs = plt.subplots(figsize = (19,15), nrows = 2)

# Column to draw and the line styling that goes with it (drawn in this order)
series_styles = [
    ('real_vol', dict(color = 'k', ls = '--')),
    ('real_vol_pos', dict(color = 'b')),
    ('real_vol_neg', dict(color = 'r')),
    ('SJ_10', dict(color = 'g')),
]

for ax, panel_df in zip(axs, [sample_daily_data_df, sample_monthly_data_df]):
    for column, style in series_styles:
        panel_df.plot('datetime', column, alpha = 0.5, ax = ax, **style)

# SHAR Regs

## SPY

In [None]:
# SPY daily rows whose datetime lies between the bounds '2002' and '2019'
# (both ends inclusive)
spy_daily_data_df = daily_data_df.query('permno == "84398"')
in_window = spy_daily_data_df['datetime'].between('2002', '2019')
spy_daily_data_df = spy_daily_data_df.loc[in_window]

# HAC standard errors with a lag length of int(0.75 * T^(1/3)), T = sample rows
hac_lags = int(0.75*len(spy_daily_data_df)**(1/3))
sj_model = smf.ols('real_var_lead ~ SJ + real_var_weekly + real_var_monthly',
                   data = spy_daily_data_df)
fit = sj_model.fit(cov_type = 'HAC', cov_kwds = {"maxlags": hac_lags})
fit.summary()

In [None]:
# Excluding 2020
spy_daily_data_df = daily_data_df.query('permno == "84398"')
in_window = spy_daily_data_df['datetime'].between('2002', '2020')
spy_daily_data_df = spy_daily_data_df.loc[in_window]

# Three specifications of next-day realized variance, each estimated by OLS with
# HAC standard errors and a lag length of int(0.75 * T^(1/3))
hac_lags = int(0.75*len(spy_daily_data_df)**(1/3))
formulas = [
    'real_var_lead ~ real_var + real_var_weekly + real_var_monthly',
    'real_var_lead ~ real_var_pos + real_var_neg + real_var_weekly + real_var_monthly',
    'real_var_lead ~ SJ + real_var_weekly + real_var_monthly',
]
fit1, fit2, fit3 = [
    smf.ols(formula, data = spy_daily_data_df).fit(cov_type = 'HAC', cov_kwds = {"maxlags": hac_lags})
    for formula in formulas
]

stargazer = Stargazer([fit1, fit2, fit3])
stargazer.covariate_order(['real_var', 'real_var_pos', 'real_var_neg', 'SJ', 'real_var_weekly', 'real_var_monthly'])

# Swap column names for LaTeX labels. Order matters: the longer names go first
# so the bare 'real_var' replacement cannot clobber them.
latex_labels = [
    ('SJ ', '$SJ_t$'),
    ('real_var_weekly', '$RV_{t:t-4}$'),
    ('real_var_monthly', '$RV_{t:t-22}$'),
    ('real_var_pos', '$RV_{t}^+$'),
    ('real_var_neg', '$RV_{t}^-$'),
    ('real_var', '$RV_{t}$'),
]
latex_table = stargazer.render_latex()
for column_name, latex_label in latex_labels:
    latex_table = latex_table.replace(column_name, latex_label)
print(latex_table)

In [None]:
# 2002-2020
spy_daily_data_df = daily_data_df.query('permno == "84398"')
in_window = spy_daily_data_df['datetime'].between('2002', '2021')
spy_daily_data_df = spy_daily_data_df.loc[in_window]

# Three specifications of next-day realized variance, each estimated by OLS with
# HAC standard errors and a lag length of int(0.75 * T^(1/3))
hac_lags = int(0.75*len(spy_daily_data_df)**(1/3))
formulas = [
    'real_var_lead ~ real_var + real_var_weekly + real_var_monthly',
    'real_var_lead ~ real_var_pos + real_var_neg + real_var_weekly + real_var_monthly',
    'real_var_lead ~ SJ + real_var_weekly + real_var_monthly',
]
fit1, fit2, fit3 = [
    smf.ols(formula, data = spy_daily_data_df).fit(cov_type = 'HAC', cov_kwds = {"maxlags": hac_lags})
    for formula in formulas
]

stargazer = Stargazer([fit1, fit2, fit3])
stargazer.covariate_order(['real_var', 'real_var_pos', 'real_var_neg', 'SJ', 'real_var_weekly', 'real_var_monthly'])

# Swap column names for LaTeX labels. Order matters: the longer names go first
# so the bare 'real_var' replacement cannot clobber them.
latex_labels = [
    ('SJ ', '$SJ_t$'),
    ('real_var_weekly', '$RV_{t:t-4}$'),
    ('real_var_monthly', '$RV_{t:t-22}$'),
    ('real_var_pos', '$RV_{t}^+$'),
    ('real_var_neg', '$RV_{t}^-$'),
    ('real_var', '$RV_{t}$'),
]
latex_table = stargazer.render_latex()
for column_name, latex_label in latex_labels:
    latex_table = latex_table.replace(column_name, latex_label)
print(latex_table)

In [None]:
# 2020 only
spy_daily_data_df = daily_data_df.query('permno == "84398"')
in_window = spy_daily_data_df['datetime'].between('2020', '2021')
spy_daily_data_df = spy_daily_data_df.loc[in_window]

# Three specifications of next-day realized variance, each estimated by OLS with
# HAC standard errors and a lag length of int(0.75 * T^(1/3))
hac_lags = int(0.75*len(spy_daily_data_df)**(1/3))
formulas = [
    'real_var_lead ~ real_var + real_var_weekly + real_var_monthly',
    'real_var_lead ~ real_var_pos + real_var_neg + real_var_weekly + real_var_monthly',
    'real_var_lead ~ SJ + real_var_weekly + real_var_monthly',
]
fit1, fit2, fit3 = [
    smf.ols(formula, data = spy_daily_data_df).fit(cov_type = 'HAC', cov_kwds = {"maxlags": hac_lags})
    for formula in formulas
]

stargazer = Stargazer([fit1, fit2, fit3])
stargazer.covariate_order(['real_var', 'real_var_pos', 'real_var_neg', 'SJ', 'real_var_weekly', 'real_var_monthly'])

# Swap column names for LaTeX labels. Order matters: the longer names go first
# so the bare 'real_var' replacement cannot clobber them.
latex_labels = [
    ('SJ ', '$SJ_t$'),
    ('real_var_weekly', '$RV_{t:t-4}$'),
    ('real_var_monthly', '$RV_{t:t-22}$'),
    ('real_var_pos', '$RV_{t}^+$'),
    ('real_var_neg', '$RV_{t}^-$'),
    ('real_var', '$RV_{t}$'),
]
latex_table = stargazer.render_latex()
for column_name, latex_label in latex_labels:
    latex_table = latex_table.replace(column_name, latex_label)
print(latex_table)

## Clusters

In [None]:
def _fit_shar_models(permno):
    """Fit the two daily regressions for one PERMNO.

    Parameters
    ----------
    permno : str
        Value matched against the ``permno`` column of ``daily_data_df``.

    Returns
    -------
    tuple
        ``(fit, fit_sj)``: the regression on positive/negative realized
        semivariance and the regression on signed jump variation. Both are OLS
        fits with HAC standard errors and int(0.75 * T^(1/3)) lags, estimated
        on rows whose datetime lies between '2002' and '2020'.
    """
    sample_df = daily_data_df.query(f'permno == "{permno}"')
    sample_df = sample_df.loc[sample_df['datetime'].between('2002', '2020')]
    hac_lags = int(0.75*len(sample_df)**(1/3))
    fit = smf.ols('real_var_lead ~ real_var_pos + real_var_neg + real_var_weekly + real_var_monthly',
           data = sample_df).fit(cov_type = 'HAC', cov_kwds={"maxlags": hac_lags})
    fit_sj = smf.ols('real_var_lead ~ SJ + real_var_weekly + real_var_monthly',
           data = sample_df).fit(cov_type = 'HAC', cov_kwds={"maxlags": hac_lags})
    return fit, fit_sj


def _shar_summary(fit, fit_sj):
    """Collect the coefficients and absolute t-statistics stored per stock."""
    return {
        'coeff_phi_pos_d': fit.params['real_var_pos'],
        'coeff_phi_neg_d': fit.params['real_var_neg'],
        't_phi_pos_d': np.abs(fit.tvalues['real_var_pos']),
        't_phi_neg_d': np.abs(fit.tvalues['real_var_neg']),
        'coeff_SJ': fit_sj.params['SJ'],
        't_SJ': np.abs(fit_sj.tvalues['SJ']),
    }


# Fits: ticker labels and semivariance fits, kept in the same order for the table
fit_names = []
shar_fits = []

# Snapshot the rows first, because result columns are added to the frame inside the loop
cluster_rows = list(zip(cluster_class_df.index, cluster_class_df['permno'], cluster_class_df['ticker']))
for i, permno, ticker in cluster_rows:

    if not permno:
        print('Skipping ', ticker)
        continue

    # Run OLS
    fit, fit_sj = _fit_shar_models(permno)
    fit_names.append(ticker)
    shar_fits.append(fit)

    # Save results to dataframe
    for result_col, value in _shar_summary(fit, fit_sj).items():
        cluster_class_df.loc[i, result_col] = value

## SPY
spy_class_df = pd.DataFrame({'permno': '84398', 'ticker': 'SPY'}, index = [0])

# Run OLS
fit, fit_sj = _fit_shar_models(spy_class_df.loc[0, 'permno'])
fit_names.append(spy_class_df.loc[0, 'ticker'])
shar_fits.append(fit)

# Save results to dataframe
for result_col, value in _shar_summary(fit, fit_sj).items():
    spy_class_df.loc[0, result_col] = value

### Tables

In [None]:
# One table column per fitted model (the cluster stocks followed by SPY),
# labelled with the ticker collected alongside each fit
stargazer = Stargazer(shar_fits)
# custom_columns needs one separator per label, and the separators must add up to
# the number of models; size the list from fit_names rather than hard-coding 14,
# which no longer matches once SPY is appended to the 14 cluster stocks
stargazer.custom_columns(fit_names, [1]*len(fit_names))
stargazer.show_model_numbers(False)
stargazer.covariate_order(['real_var_pos', 'real_var_neg', 'real_var_weekly', 'real_var_monthly'])
stargazer.significant_digits(2)
# stargazer.significance_levels([1e-999, 1e-999, 1e-999])

# Print the LaTeX source with column names swapped for math labels
print(stargazer.render_latex()
      .replace('real_var_weekly', '$RV_{t:t-4}$')
      .replace('real_var_monthly', '$RV_{t:t-22}$')
      .replace('real_var_pos', '$RV_{t}^+$')
      .replace('real_var_neg', '$RV_{t}^-$')
)
# Last expression: rich display of the same table in the notebook
stargazer

### Coeffs (SemiRV)

In [None]:
# Scatter of the coefficient on RV+ (x) against the coefficient on RV- (y), one point per stock
fig, ax = plt.subplots(figsize = (16,11))
p1 = cluster_class_df.plot.scatter('coeff_phi_pos_d', 'coeff_phi_neg_d', s = 100, ax= ax)
texts = []

# Cluster: ticker label just up and to the right of each point, bold for FAANG members
cluster_points = zip(
    cluster_class_df['coeff_phi_pos_d'],
    cluster_class_df['coeff_phi_neg_d'],
    cluster_class_df['ticker'],
    cluster_class_df['is_faang'],
)
for x_coef, y_coef, ticker, faang_flag in cluster_points:
    label = p1.text(
        x_coef+0.01, y_coef+0.01, ticker,
        horizontalalignment='left',
        weight = ('bold' if faang_flag else 'normal'),
        size=14, color='black',
    )
    texts.append(label)

# SPY: larger green marker with a bold label
p2 = spy_class_df.plot.scatter('coeff_phi_pos_d', 'coeff_phi_neg_d', s = 150, ax= ax, color = 'tab:green')
spy_label = p2.text(
    spy_class_df.loc[0, 'coeff_phi_pos_d']+0.01, spy_class_df.loc[0, 'coeff_phi_neg_d']+0.01,
    spy_class_df.loc[0, 'ticker'],
    horizontalalignment='left', weight = 'bold', size=14, color='black',
)
texts.append(spy_label)

# Fixed axis ranges (automatic label de-overlapping with adjust_text is left off here)
ax.set_xlim(-0.6, 0.5)
ax.set_ylim(-0.1, 1.3)

# Reference lines: both zero axes and the 45-degree line where the two coefficients are equal
ax.axhline(0, lw = 2, ls = '--', color = 'gray')
ax.axvline(0, lw = 2, ls = '--', color = 'gray')
ax.plot([-100, 100], [-100, 100], lw = 2, ls = '--', color = 'darkgray')
ax.set_ylabel('Coefficient on $RV_t^-$ ($\\phi_D^{-}$)')
ax.set_xlabel('Coefficient on $RV_t^+$ ($\\phi_D^{+}$)')

plt.tight_layout()
plt.savefig('../../exhibits/semivar_coeffs.pdf')

### T-stats (SemiRV)

In [None]:
# Scatter of the absolute t-statistic on RV+ (x) against the one on RV- (y), one point per stock
fig, ax = plt.subplots(figsize = (14,11))
p1 = cluster_class_df.plot.scatter('t_phi_pos_d', 't_phi_neg_d', s = 100, ax= ax)
texts = []

# Ticker label to the right of each point, bold for FAANG members
for line in range(0,cluster_class_df.shape[0]):
     texts.append(p1.text(cluster_class_df.loc[line, 't_phi_pos_d']+0.03, cluster_class_df.loc[line, 't_phi_neg_d'],
     cluster_class_df.loc[line, 'ticker'], horizontalalignment='left', weight = ('bold' if cluster_class_df.loc[line,'is_faang'] else 'normal'),
     size='medium', color='black'))

# SPY
p2 = spy_class_df.plot.scatter('t_phi_pos_d', 't_phi_neg_d', s = 150, ax= ax, color = 'tab:green')
texts.append(p2.text(spy_class_df.loc[0, 't_phi_pos_d']+0.03, spy_class_df.loc[0, 't_phi_neg_d']+0.01,
spy_class_df.loc[0, 'ticker'], horizontalalignment='left', weight = 'bold', size='medium', color='black'))

# adjust_text(texts, force_text=0.01)
ax.set_xlim(0, 2.75)
# 1.96 threshold lines split the plane into significance regions
ax.axhline(1.96, lw = 2, ls = '--', color = 'k', alpha = 0.5)
ax.axvline(1.96, lw = 2, ls = '--', color = 'k', alpha = 0.5)
ax.set_ylabel('T-Stat on Real SemiVar Negative ($\\phi_D^{-}$)')
# Braces must balance inside $...$: the stray extra '}' made mathtext fail to parse this label
ax.set_xlabel('T-Stat on Real SemiVar Positive ($\\phi_D^{+}$)')
ax.text(1, 5, '$\\phi_D^{-}$ significant', horizontalalignment = 'center', verticalalignment = 'center' , color = 'tab:blue')
# '\\{' and '\\}' spell the literal-brace commands without relying on the invalid '\{' string escape
ax.text(2.3, 5, '$\\{\\phi_D^{-}, \\phi_D^{+}\\}$ significant', horizontalalignment = 'center', verticalalignment = 'center' , color = 'tab:blue')
ax.text(2.3, 1, '$\\phi_D^{+}$ significant', horizontalalignment = 'center', verticalalignment = 'center' , color = 'tab:blue')

plt.tight_layout()
plt.savefig('../../exhibits/semivar_tstats.pdf')

### Coeffs (SJV)

In [None]:
# Scatter of the signed-jump coefficient (x) against its absolute t-statistic (y), one point per stock
fig, ax = plt.subplots(figsize = (16,11))
p1 = cluster_class_df.plot.scatter('coeff_SJ', 't_SJ', s = 100, ax= ax)
texts = []

# Ticker label to the right of each point, bold for FAANG members
cluster_points = zip(
    cluster_class_df['coeff_SJ'],
    cluster_class_df['t_SJ'],
    cluster_class_df['ticker'],
    cluster_class_df['is_faang'],
)
for sj_coef, sj_tstat, ticker, faang_flag in cluster_points:
    label = p1.text(
        sj_coef+0.01, sj_tstat, ticker,
        horizontalalignment='left',
        weight = ('bold' if faang_flag else 'normal'),
        size=14, color='black',
    )
    texts.append(label)

# SPY: larger green marker with a bold label
p2 = spy_class_df.plot.scatter('coeff_SJ', 't_SJ', s = 150, ax= ax, color = 'tab:green')
spy_label = p2.text(
    spy_class_df.loc[0, 'coeff_SJ']+0.01, spy_class_df.loc[0, 't_SJ']+0.01,
    spy_class_df.loc[0, 'ticker'],
    horizontalalignment='left', weight = 'bold', size='medium', color='black',
)
texts.append(spy_label)

# Nudge the labels apart, then add the 1.96 threshold and the zero-coefficient line
adjust_text(texts, force_text=0.05)
ax.axhline(1.96, lw = 2, ls = '-.', color = 'g')
ax.axvline(0, lw = 2, ls = '--', color = 'gray')
ax.set_ylabel('T-stat on Signed Jump Variation')
ax.set_xlabel('Coeff on Signed Jump Variation')

plt.tight_layout()
plt.savefig('../../exhibits/semivar_sj_results.pdf')