In [2]:
pip install yfinance --upgrade


Collecting yfinance
  Downloading yfinance-0.2.66-py2.py3-none-any.whl.metadata (6.0 kB)
Collecting multitasking>=0.0.7 (from yfinance)
  Downloading multitasking-0.0.12.tar.gz (19 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting frozendict>=2.3.4 (from yfinance)
  Downloading frozendict-2.4.7-py3-none-any.whl.metadata (23 kB)
Collecting peewee>=3.16.2 (from yfinance)
  Downloading peewee-3.18.3.tar.gz (3.0 MB)
     ---------------------------------------- 0.0/3.0 MB ? eta -:--:--
     --- ------------------------------------ 0.3/3.0 MB ? eta -:--:--
     ------------- -------------------------- 1.0/3.0 MB 2.8 MB/s eta 0:00:01
     -------------------- ------------------- 1.6/3.0 MB 2.7 MB/s eta 0:00:01
     ------------------------------- -------- 2.4/3.0 MB 3.2 MB/s eta 0:00:01
     ---------------------------------------- 3.0/3.0 MB 3.5 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dep


[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import yfinance as yf
import pandas as pd
import os

# ------------------------------------------------------------
# Load dataset
# ------------------------------------------------------------
df = pd.read_parquet("sampled_35k_embedded.parquet")

df["Date"] = pd.to_datetime(df["Date"])

tickers = sorted(df["Stock_symbol"].dropna().unique().tolist())
start_date = df["Date"].min().strftime("%Y-%m-%d")
end_date = df["Date"].max().strftime("%Y-%m-%d")

print(f"Tickers: {len(tickers)}")
print(f"Date range: {start_date} → {end_date}")

# ------------------------------------------------------------
# Download prices
# ------------------------------------------------------------
tickers_str = " ".join(tickers)

prices = yf.download(
    tickers_str,
    start=start_date,
    end=end_date,
    auto_adjust=True,
    group_by="ticker",
    threads=True,
    progress=True
)

# ------------------------------------------------------------
# Save per-ticker parquet files
# ------------------------------------------------------------
os.makedirs("prices_chunks", exist_ok=True)

for t in tickers:
    if t not in prices:
        continue

    df_t = prices[t].dropna(how="all")
    if df_t.empty:
        continue

    df_t["Stock_symbol"] = t
    df_t.reset_index(inplace=True)
    df_t.to_parquet(f"prices_chunks/{t}.parquet")

print("✅ Price data saved for all available tickers.")


Tickers: 600
Date range: 2009-04-08 → 2023-12-16


[*                      3%                       ]  17 of 600 completedHTTP Error 404: {"quoteSummary":{"result":null,"error":{"code":"Not Found","description":"Quote not found for symbol: BHLB"}}}
[*********************100%***********************]  600 of 600 completed

61 Failed downloads:
['BHLB', 'SASR', 'X', 'ATSG', 'SOLO', 'INFN', 'COOP', 'YY', 'ACRX', 'CARA', 'CEI', 'DLA', 'MORF', 'PZC', 'SAVE', 'CPE', 'APDN', 'PMF', 'ENZ', 'PMX', 'CBAY', 'CEIX', 'YNDX', 'AINC', 'MRNS', 'GOL', 'ADES', 'SOI', 'PRMW', 'EVBN', 'SWI', 'TGH', 'NVTA', 'SLCA', 'ESGR', 'HARP', 'FIF', 'MRTX', 'DADA', 'CDMO', 'PEAK', 'EBIX', 'SP', 'FLIC', 'FUV', 'PFC', 'ARCH', 'SPTN', 'NEPT', 'TEDU', 'CMRX', 'PHT', 'INFI', 'AE']: YFTzMissingError('possibly delisted; no timezone found')
['TWOU', 'SPI', 'AFMD', 'CSSE']: YFPricesMissingError('possibly delisted; no price data found  (1d 2009-04-08 -> 2023-12-16)')
['NRGU', 'VRM']: YFPricesMissingError('possibly delisted; no price data found  (1d 2009-04-08 -> 2023-12-16) (Yah

✅ Price data saved for all available tickers.


In [6]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import glob

def normalize_prices(df):
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = [c[0] for c in df.columns]

    keep = ["Date", "Open", "High", "Low", "Close", "Volume", "Stock_symbol"]
    df = df[keep]

    df["Date"] = pd.to_datetime(df["Date"])

    df[["Open", "High", "Low", "Close"]] = df[
        ["Open", "High", "Low", "Close"]
    ].astype("float32")

    df["Volume"] = df["Volume"].astype("int64")

    return df


price_files = glob.glob("prices_chunks/*.parquet")

writer = None

for f in price_files:
    df = pd.read_parquet(f)
    df = normalize_prices(df)

    table = pa.Table.from_pandas(
        df,
        preserve_index=False,
        nthreads=1 
    )

    if writer is None:
        writer = pq.ParquetWriter(
            "all_prices.parquet",
            table.schema,
            compression="snappy"
        )

    writer.write_table(table)

writer.close()

print("all_prices.parquet written successfully")


all_prices.parquet written successfully


In [7]:
import pandas as pd
import numpy as np

prices_df = pd.read_parquet("all_prices.parquet")

prices_df['Date'] = pd.to_datetime(prices_df['Date'])
prices_df = prices_df.sort_values(['Stock_symbol', 'Date'])

prices_df.head()


Unnamed: 0,Date,Open,High,Low,Close,Volume,Stock_symbol
0,2009-04-08,34.503216,35.364933,33.874161,35.054714,1872200,AAP
1,2009-04-09,35.606201,36.623032,35.606201,36.44207,2094600,AAP
2,2009-04-13,36.235264,36.321438,34.951305,35.520039,1845400,AAP
3,2009-04-14,35.235676,35.425255,34.218845,35.00301,1608900,AAP
4,2009-04-15,34.899593,36.330048,34.546288,36.252491,2367600,AAP


In [8]:
# Log returns
prices_df['Return'] = (
    np.log(prices_df['Close'])
    - np.log(prices_df.groupby('Stock_symbol')['Close'].shift(1))
)

# Next-day return (THIS is what SSD predicts)
prices_df['Return_t+1'] = (
    prices_df.groupby('Stock_symbol')['Return'].shift(-1)
)

In [9]:
prices_df['Volatility'] = (
    prices_df
    .groupby('Stock_symbol')['Return']
    .rolling(window=5)
    .std()
    .reset_index(level=0, drop=True)
)

In [10]:
market_df = prices_df[
    ['Date', 'Stock_symbol', 'Return', 'Return_t+1', 'Volatility', 'Volume']
].copy()

market_df = market_df.dropna()

market_df.to_parquet("market_data.parquet")
print("market_data.parquet saved")

market_data.parquet saved


In [23]:
import pandas as pd

market_df = pd.read_parquet("market_data.parquet")
ssd_df = pd.read_parquet("ssd_final.parquet")
ssd_df['Date'] = pd.to_datetime(ssd_df['Date'])

final_df = pd.merge(
    ssd_df,
    market_df,
    on=['Stock_symbol', 'Date'],
    how='inner'
)

print("Final dataset shape:", final_df.shape)
final_df.head()

Final dataset shape: (190878, 9)


Unnamed: 0,Stock_symbol,Date,E_t,mu_t_minus_1,SSD,Return,Return_t+1,Volatility,Volume
0,AAP,2010-04-20,"[-0.0077822395, 0.021344615, -0.024955302, -0....","[-0.0077822395, 0.021344615, -0.024955302, -0....",-1.192093e-07,0.000225,0.011406,0.033483,1169500
1,AAP,2010-05-20,"[-0.0477138, 0.039717715, -0.042229753, -0.009...","[-0.0077822395, 0.021344615, -0.024955302, -0....",0.4177352,0.06102,0.00975,0.02706,4362400
2,AAP,2010-12-14,"[-0.0006515795, 0.060737956, -0.019967664, -0....","[-0.015854474, 0.03892529, -0.029906362, -0.02...",0.5126962,0.000441,-0.003681,0.002946,458000
3,AAP,2011-02-22,"[-0.014063652, 0.075925656, -0.03710558, -0.03...","[-0.012813896, 0.043287825, -0.027918624, -0.0...",0.5630123,-0.015507,-0.011954,0.0145,1313500
4,AAP,2011-05-18,"[-0.06610438, 0.06828314, -0.014800495, -0.019...","[-0.013063847, 0.049815394, -0.029756015, -0.0...",0.4241759,0.013392,-0.106997,0.024694,1473900


In [24]:
import numpy as np
import statsmodels.api as sm
from scipy.stats.mstats import winsorize

# =====================================================
# 0. SORT & BASIC CLEANING
# =====================================================
final_df = final_df.sort_values(
    ['Stock_symbol', 'Date']
).reset_index(drop=True)

# Winsorize key variables (1% each tail)
final_df['SSD_clean'] = winsorize(final_df['SSD'], limits=[0.01, 0.01])
final_df['Vol_clean'] = winsorize(final_df['Volatility'], limits=[0.01, 0.01])

# Lagged volatility (persistence control)
final_df['Lag_Vol'] = (
    final_df.groupby('Stock_symbol')['Vol_clean']
            .shift(1)
)

# News volume control
news_counts = (
    final_df.groupby(['Stock_symbol', 'Date'])
            .size()
            .reset_index(name='News_Count')
)
final_df = final_df.merge(
    news_counts,
    on=['Stock_symbol', 'Date'],
    how='left'
)
final_df['Log_News_Count'] = np.log1p(final_df['News_Count'])

# =====================================================
# 1. STEP A — ORTHOGONALIZE SSD (PURE SEMANTIC SHOCK)
# =====================================================
ortho_controls = ['Log_News_Count']
X_ortho = sm.add_constant(final_df[ortho_controls])
y_ortho = final_df['SSD_clean']

mask_ortho = X_ortho.notna().all(axis=1) & y_ortho.notna()
ortho_model = sm.OLS(
    y_ortho[mask_ortho],
    X_ortho.loc[mask_ortho]
).fit()

final_df.loc[mask_ortho, 'SSD_Pure_raw'] = ortho_model.resid
final_df['SSD_Pure'] = winsorize(final_df['SSD_Pure_raw'], limits=[0.01, 0.01])

print("✅ STEP A complete: SSD orthogonalized (pure semantic shock)")

# =====================================================
# 2. STEP B — EXTREME SHOCK DUMMY (TOP 10%)
# =====================================================
shock_cut = final_df['SSD_clean'].quantile(0.90)
final_df['High_Shock'] = (final_df['SSD_clean'] >= shock_cut).astype(int)

final_df['SSD_Pure_centered'] = (
    final_df['SSD_Pure'] - final_df['SSD_Pure'].mean()
)

final_df['SSD_HighShock'] = (
    final_df['SSD_Pure_centered'] * final_df['High_Shock']
)

print("✅ STEP B complete: High_Shock dummy and interaction created")

# =====================================================
# 3. STEP C — IMPULSE RESPONSE TIMING
# =====================================================
final_df['Vol_t']         = final_df['Vol_clean']
final_df['Vol_t_plus_1']  = final_df.groupby('Stock_symbol')['Vol_clean'].shift(-1)

# ✅ Proper placebo (pre-determined volatility)
final_df['Vol_t_minus_2'] = final_df.groupby('Stock_symbol')['Vol_clean'].shift(2)

# =====================================================
# 4. STEP D — REGRESSIONS
# =====================================================
def run_reg(dep_var, use_interaction=False):
    cols = ['SSD_Pure_centered', 'Lag_Vol', 'Log_News_Count']
    if use_interaction:
        cols.append('SSD_HighShock')
    X = sm.add_constant(final_df[cols])
    y = final_df[dep_var]
    mask = X.notna().all(axis=1) & y.notna()
    return sm.OLS(y[mask], X.loc[mask]).fit(cov_type='HC3')

# ✅ Placebo: SSD_t → Vol_{t-2}
model_placebo = run_reg('Vol_t_minus_2')

# Contemporaneous
model_contemp = run_reg('Vol_t')

# Predictive
model_predictive = run_reg('Vol_t_plus_1')

# Non-linearity
model_nl = run_reg('Vol_t_plus_1', use_interaction=True)

# =====================================================
# 5. OUTPUT
# =====================================================
print("\n--- PLACEBO (Vol_{t-2}) ---")
print(model_placebo.summary())

print("\n--- CONTEMPORANEOUS (Vol_t) ---")
print(model_contemp.summary())

print("\n--- PREDICTIVE (Vol_{t+1}) ---")
print(model_predictive.summary())

print("\n--- NON-LINEARITY (High Shock Interaction) ---")
print(model_nl.summary())

# =====================================================
# 6. DIAGNOSTICS
# =====================================================
print("\nDiagnostics (Predictive t+1 model):")
print("Durbin-Watson:",
      sm.stats.stattools.durbin_watson(model_predictive.resid))
print("Skew:", model_predictive.resid.skew())
print("Kurtosis:", model_predictive.resid.kurtosis())

# =====================================================
# 7. ECONOMIC MAGNITUDE
# =====================================================
ssd_std = final_df['SSD_Pure_centered'].std()
impact_main = model_predictive.params['SSD_Pure_centered'] * ssd_std

print("\nEconomic Magnitude:")
print(f"1 SD SSD shock → Δ Vol_(t+1): {impact_main:.5f}")

if 'SSD_HighShock' in model_nl.params:
    impact_extreme = model_nl.params['SSD_HighShock'] * ssd_std
    print(f"Extreme shock add-on: {impact_extreme:.5f}")


✅ STEP A complete: SSD orthogonalized (pure semantic shock)
✅ STEP B complete: High_Shock dummy and interaction created


  arr.partition(



--- PLACEBO (Vol_{t-2}) ---
                            OLS Regression Results                            
Dep. Variable:          Vol_t_minus_2   R-squared:                       0.553
Model:                            OLS   Adj. R-squared:                  0.553
Method:                 Least Squares   F-statistic:                 2.798e+04
Date:                Thu, 25 Dec 2025   Prob (F-statistic):               0.00
Time:                        21:19:10   Log-Likelihood:             5.6535e+05
No. Observations:              189800   AIC:                        -1.131e+06
Df Residuals:                  189797   BIC:                        -1.131e+06
Df Model:                           2                                         
Covariance Type:                  HC3                                         
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
SSD_Pure_