In [2]:
pip install yfinance --upgrade


Collecting yfinance
  Downloading yfinance-0.2.66-py2.py3-none-any.whl.metadata (6.0 kB)
Collecting multitasking>=0.0.7 (from yfinance)
  Downloading multitasking-0.0.12.tar.gz (19 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting frozendict>=2.3.4 (from yfinance)
  Downloading frozendict-2.4.7-py3-none-any.whl.metadata (23 kB)
Collecting peewee>=3.16.2 (from yfinance)
  Downloading peewee-3.18.3.tar.gz (3.0 MB)
     ---------------------------------------- 0.0/3.0 MB ? eta -:--:--
     --- ------------------------------------ 0.3/3.0 MB ? eta -:--:--
     ------------- -------------------------- 1.0/3.0 MB 2.8 MB/s eta 0:00:01
     -------------------- ------------------- 1.6/3.0 MB 2.7 MB/s eta 0:00:01
     ------------------------------- -------- 2.4/3.0 MB 3.2 MB/s eta 0:00:01
     ---------------------------------------- 3.0/3.0 MB 3.5 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dep


[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import yfinance as yf
import pandas as pd
import os

# ------------------------------------------------------------
# Load dataset
# ------------------------------------------------------------
df = pd.read_parquet("sampled_35k_embedded.parquet")

df["Date"] = pd.to_datetime(df["Date"])

tickers = sorted(df["Stock_symbol"].dropna().unique().tolist())
start_date = df["Date"].min().strftime("%Y-%m-%d")
end_date = df["Date"].max().strftime("%Y-%m-%d")

print(f"Tickers: {len(tickers)}")
print(f"Date range: {start_date} → {end_date}")

# ------------------------------------------------------------
# Download prices
# ------------------------------------------------------------
tickers_str = " ".join(tickers)

prices = yf.download(
    tickers_str,
    start=start_date,
    end=end_date,
    auto_adjust=True,
    group_by="ticker",
    threads=True,
    progress=True
)

# ------------------------------------------------------------
# Save per-ticker parquet files
# ------------------------------------------------------------
os.makedirs("prices_chunks", exist_ok=True)

for t in tickers:
    if t not in prices:
        continue

    df_t = prices[t].dropna(how="all")
    if df_t.empty:
        continue

    df_t["Stock_symbol"] = t
    df_t.reset_index(inplace=True)
    df_t.to_parquet(f"prices_chunks/{t}.parquet")

print("✅ Price data saved for all available tickers.")


Tickers: 600
Date range: 2009-04-08 → 2023-12-16


[*                      3%                       ]  17 of 600 completedHTTP Error 404: {"quoteSummary":{"result":null,"error":{"code":"Not Found","description":"Quote not found for symbol: BHLB"}}}
[*********************100%***********************]  600 of 600 completed

61 Failed downloads:
['BHLB', 'SASR', 'X', 'ATSG', 'SOLO', 'INFN', 'COOP', 'YY', 'ACRX', 'CARA', 'CEI', 'DLA', 'MORF', 'PZC', 'SAVE', 'CPE', 'APDN', 'PMF', 'ENZ', 'PMX', 'CBAY', 'CEIX', 'YNDX', 'AINC', 'MRNS', 'GOL', 'ADES', 'SOI', 'PRMW', 'EVBN', 'SWI', 'TGH', 'NVTA', 'SLCA', 'ESGR', 'HARP', 'FIF', 'MRTX', 'DADA', 'CDMO', 'PEAK', 'EBIX', 'SP', 'FLIC', 'FUV', 'PFC', 'ARCH', 'SPTN', 'NEPT', 'TEDU', 'CMRX', 'PHT', 'INFI', 'AE']: YFTzMissingError('possibly delisted; no timezone found')
['TWOU', 'SPI', 'AFMD', 'CSSE']: YFPricesMissingError('possibly delisted; no price data found  (1d 2009-04-08 -> 2023-12-16)')
['NRGU', 'VRM']: YFPricesMissingError('possibly delisted; no price data found  (1d 2009-04-08 -> 2023-12-16) (Yah

✅ Price data saved for all available tickers.


In [6]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import glob

def normalize_prices(df):
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = [c[0] for c in df.columns]

    keep = ["Date", "Open", "High", "Low", "Close", "Volume", "Stock_symbol"]
    df = df[keep]

    df["Date"] = pd.to_datetime(df["Date"])

    df[["Open", "High", "Low", "Close"]] = df[
        ["Open", "High", "Low", "Close"]
    ].astype("float32")

    df["Volume"] = df["Volume"].astype("int64")

    return df


price_files = glob.glob("prices_chunks/*.parquet")

writer = None

for f in price_files:
    df = pd.read_parquet(f)
    df = normalize_prices(df)

    table = pa.Table.from_pandas(
        df,
        preserve_index=False,
        nthreads=1 
    )

    if writer is None:
        writer = pq.ParquetWriter(
            "all_prices.parquet",
            table.schema,
            compression="snappy"
        )

    writer.write_table(table)

writer.close()

print("all_prices.parquet written successfully")


all_prices.parquet written successfully


In [None]:
import pandas as pd
import numpy as np

prices_df = pd.read_parquet("all_prices.parquet")

prices_df['Date'] = pd.to_datetime(prices_df['Date'])
prices_df = prices_df.sort_values(['Stock_symbol', 'Date'])

prices_df.head()


Unnamed: 0,Date,Open,High,Low,Close,Volume,Stock_symbol
0,2009-04-08,34.503216,35.364933,33.874161,35.054714,1872200,AAP
1,2009-04-09,35.606201,36.623032,35.606201,36.44207,2094600,AAP
2,2009-04-13,36.235264,36.321438,34.951305,35.520039,1845400,AAP
3,2009-04-14,35.235676,35.425255,34.218845,35.00301,1608900,AAP
4,2009-04-15,34.899593,36.330048,34.546288,36.252491,2367600,AAP


In [8]:
# Log returns
prices_df['Return'] = (
    np.log(prices_df['Close'])
    - np.log(prices_df.groupby('Stock_symbol')['Close'].shift(1))
)

# Next-day return (THIS is what SSD predicts)
prices_df['Return_t+1'] = (
    prices_df.groupby('Stock_symbol')['Return'].shift(-1)
)

In [9]:
prices_df['Volatility'] = (
    prices_df
    .groupby('Stock_symbol')['Return']
    .rolling(window=5)
    .std()
    .reset_index(level=0, drop=True)
)

In [10]:
market_df = prices_df[
    ['Date', 'Stock_symbol', 'Return', 'Return_t+1', 'Volatility', 'Volume']
].copy()

market_df = market_df.dropna()

market_df.to_parquet("market_data.parquet")
print("market_data.parquet saved")

market_data.parquet saved


In [3]:
import pyarrow.dataset as ds

# Open SSD parquet as a PyArrow dataset
dataset = ds.dataset("ssd_final_variants.parquet", format="parquet")


In [5]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy.stats.mstats import winsorize
from linearmodels.panel import PanelOLS



In [7]:
# Sort by stock & date
df = df.sort_values(['Stock_symbol', 'Date']).reset_index(drop=True)

# Clean volatility
df['Vol_clean'] = winsorize(df['Volatility'], limits=[0.01, 0.01])

# Lags
for lag in range(1, 6):
    df[f'Lag_Vol_{lag}'] = df.groupby('Stock_symbol')['Vol_clean'].shift(lag)

df['Lag_Vol_mean5'] = df[[f'Lag_Vol_{i}' for i in range(1, 6)]].mean(axis=1)

# Impulse response / predictive
df['Vol_t'] = df['Vol_clean']
df['Vol_t_plus_1'] = df.groupby('Stock_symbol')['Vol_clean'].shift(-1)
df['Vol_t_minus_2'] = df.groupby('Stock_symbol')['Vol_clean'].shift(2)


In [12]:
pip install dask[dataframe]

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting dask[dataframe]
  Downloading dask-2025.12.0-py3-none-any.whl.metadata (3.8 kB)
Collecting cloudpickle>=3.0.0 (from dask[dataframe])
  Downloading cloudpickle-3.1.2-py3-none-any.whl.metadata (7.1 kB)
Collecting partd>=1.4.0 (from dask[dataframe])
  Downloading partd-1.4.2-py3-none-any.whl.metadata (4.6 kB)
Collecting toolz>=0.12.0 (from dask[dataframe])
  Downloading toolz-1.1.0-py3-none-any.whl.metadata (5.1 kB)
Collecting locket (from partd>=1.4.0->dask[dataframe])
  Downloading locket-1.0.0-py2.py3-none-any.whl.metadata (2.8 kB)
Downloading cloudpickle-3.1.2-py3-none-any.whl (22 kB)
Downloading partd-1.4.2-py3-none-any.whl (18 kB)
Downloading toolz-1.1.0-py3-none-any.whl (58 kB)
Downloading dask-2025.12.0-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------- -------------------------------- 0.3/1.5 MB ? eta -:--:--
   -------------- --------------------

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from scipy.stats.mstats import winsorize
from linearmodels.panel import PanelOLS

# ----------------------------
# 1️⃣ Load market data
# ----------------------------
market_df = pd.read_parquet("market_data.parquet")
market_df['Date'] = pd.to_datetime(market_df['Date'])

# ----------------------------
# 2️⃣ Load SSD variants
# ----------------------------
ssd_df = pd.read_parquet("ssd_final_variants.parquet")
ssd_df['Date'] = pd.to_datetime(ssd_df['Date'])

# SSD metrics to loop over
ssd_metrics = ['SSD_cosine', 'SSD_l2', 'SSD_angular']

# Store results
all_results = []

# ----------------------------
# 3️⃣ Loop over SSD variants
# ----------------------------
for metric in ssd_metrics:
    print(f"\nProcessing SSD metric: {metric}")

    # Keep only needed columns
    df = ssd_df[['Stock_symbol', 'Date', 'EMA_alpha', metric]].copy()
    df = df.rename(columns={metric: 'SSD'})

    # Merge with market data
    df = df.merge(market_df, on=['Stock_symbol', 'Date'], how='inner')

    # ----------------------------
    # Sorting & cleaning
    # ----------------------------
    df = df.sort_values(['Stock_symbol', 'Date']).reset_index(drop=True)

    df['SSD_clean'] = winsorize(df['SSD'], limits=[0.01, 0.01])
    df['Vol_clean'] = winsorize(df['Volatility'], limits=[0.01, 0.01])

    # ----------------------------
    # Lagged volatility (mean of last 5 days)
    # ----------------------------
    for lag in range(1, 6):
        df[f'Lag_Vol_{lag}'] = df.groupby('Stock_symbol')['Vol_clean'].shift(lag)

    df['Lag_Vol_mean5'] = df[[f'Lag_Vol_{i}' for i in range(1, 6)]].mean(axis=1)

    # ----------------------------
    # News volume
    # ----------------------------
    news_counts = (
        df.groupby(['Stock_symbol', 'Date'])
          .size()
          .reset_index(name='News_Count')
    )

    df = df.merge(news_counts, on=['Stock_symbol', 'Date'], how='left')
    df['Log_News_Count'] = np.log1p(df['News_Count'])

    # ----------------------------
    # Orthogonalize SSD
    # ----------------------------
    X_ortho = sm.add_constant(df[['Log_News_Count']])
    y_ortho = df['SSD_clean']

    mask = X_ortho.notna().all(axis=1) & y_ortho.notna()

    ortho_model = sm.OLS(y_ortho[mask], X_ortho.loc[mask]).fit()

    df.loc[mask, 'SSD_Pure_raw'] = ortho_model.resid
    df['SSD_Pure'] = winsorize(df['SSD_Pure_raw'], limits=[0.01, 0.01])

    # ----------------------------
    # High-shock construction
    # ----------------------------
    shock_cut = df['SSD_clean'].quantile(0.90)
    df['High_Shock'] = (df['SSD_clean'] >= shock_cut).astype(int)

    df['SSD_Pure_centered'] = df['SSD_Pure'] - df['SSD_Pure'].mean()
    df['SSD_HighShock'] = df['SSD_Pure_centered'] * df['High_Shock']

    # ----------------------------
    # Volatility leads & lags
    # ----------------------------
    df['Vol_t'] = df['Vol_clean']
    df['Vol_t_plus_1'] = df.groupby('Stock_symbol')['Vol_clean'].shift(-1)
    df['Vol_t_minus_2'] = df.groupby('Stock_symbol')['Vol_clean'].shift(2)

    # ----------------------------
    # Panel setup
    # ----------------------------
    panel_df = df.set_index(['Stock_symbol', 'Date'])

    # ----------------------------
    # Panel regressions
    # ----------------------------
    try:
        # Placebo
        model_placebo = PanelOLS.from_formula(
            'Vol_t_minus_2 ~ SSD_Pure_centered + Lag_Vol_mean5 + Log_News_Count + EntityEffects + TimeEffects',
            data=panel_df
        ).fit(cov_type='clustered', cluster_entity=True, cluster_time=True)

        # Contemporaneous
        model_contemp = PanelOLS.from_formula(
            'Vol_t ~ SSD_Pure_centered + Lag_Vol_mean5 + Log_News_Count + EntityEffects + TimeEffects',
            data=panel_df
        ).fit(cov_type='clustered', cluster_entity=True, cluster_time=True)

        # Predictive
        model_predictive = PanelOLS.from_formula(
            'Vol_t_plus_1 ~ SSD_Pure_centered + Lag_Vol_mean5 + Log_News_Count + EntityEffects + TimeEffects',
            data=panel_df
        ).fit(cov_type='clustered', cluster_entity=True, cluster_time=True)

        # Non-linearity
        model_nl = PanelOLS.from_formula(
            'Vol_t_plus_1 ~ SSD_Pure_centered + SSD_HighShock + Lag_Vol_mean5 + Log_News_Count + EntityEffects + TimeEffects',
            data=panel_df
        ).fit(cov_type='clustered', cluster_entity=True, cluster_time=True)

        # ----------------------------
        # Store results
        # ----------------------------
        ssd_std = df['SSD_Pure_centered'].std()

        all_results.append({
            'SSD_metric': metric,
            'Placebo_coef': model_placebo.params['SSD_Pure_centered'],
            'Contemp_coef': model_contemp.params['SSD_Pure_centered'],
            'Predictive_coef': model_predictive.params['SSD_Pure_centered'],
            'NonLinear_coef': model_nl.params.get('SSD_HighShock', np.nan),
            'Predictive_SD_impact': model_predictive.params['SSD_Pure_centered'] * ssd_std
        })

    except Exception as e:
        print(f"❌ Error for metric {metric}: {e}")

# ----------------------------
# 4️⃣ Final results
# ----------------------------
results_df = pd.DataFrame(all_results)

print("\nAll SSD metrics results:")
print(results_df)

# ----------------------------
# 5️⃣ Save to D drive
# ----------------------------
output_csv = r"D:\ssd_panel_results.csv"
output_parquet = r"D:\ssd_panel_results.parquet"

results_df.to_csv(output_csv, index=False)
results_df.to_parquet(output_parquet, index=False)

print(f"\n✅ Results saved to:")
print(output_csv)
print(output_parquet)



Processing SSD metric: SSD_cosine


  arr.partition(
Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)
Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)
Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)
Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)



Processing SSD metric: SSD_l2


  arr.partition(
Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)
Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)
Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)
Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)



Processing SSD metric: SSD_angular


  arr.partition(
Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)
Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)
Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)
Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)



All SSD metrics results:
    SSD_metric  Placebo_coef  Contemp_coef  Predictive_coef  NonLinear_coef  \
0   SSD_cosine     -0.000849     -0.005418        -0.005446        0.003954   
1       SSD_l2     -0.000741     -0.004650        -0.004768        0.002353   
2  SSD_angular     -0.001772     -0.011716        -0.011950        0.006152   

   Predictive_SD_impact  
0             -0.000743  
1             -0.000767  
2             -0.000780  

✅ Results saved to:
D:\ssd_panel_results.csv
D:\ssd_panel_results.parquet
