# WQ IV Momentum Data Loader

Builds a unified dataset with underlying returns and ~60D call option IV so we can reproduce the Brain-inspired IV momentum signal.


In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

pd.set_option("display.max_columns", 50)

PROJECT_ROOT = Path("/Users/nikhileshbelulkar/Documents/saturate_the_hypersphere/smile_smirks")
RAW_DIR = PROJECT_ROOT / "raw_data"
PROCESSED_DIR = PROJECT_ROOT / "processed_data"
OUTPUT_PATH = PROCESSED_DIR / "wq_iv_momentum_base.parquet"

PROJECT_ROOT, RAW_DIR, PROCESSED_DIR


(PosixPath('/Users/nikhileshbelulkar/Documents/saturate_the_hypersphere/smile_smirks'),
 PosixPath('/Users/nikhileshbelulkar/Documents/saturate_the_hypersphere/smile_smirks/raw_data'),
 PosixPath('/Users/nikhileshbelulkar/Documents/saturate_the_hypersphere/smile_smirks/processed_data'))

In [2]:
equity_cols = ["PERMNO", "date", "TICKER", "PRC", "RET"]
equity = (
    pd.read_csv(
        RAW_DIR / "equity_returns.csv",
        usecols=equity_cols,
        parse_dates=["date"],
        dtype={"PERMNO": "int32", "TICKER": "category"}
    )
    .rename(columns={"PERMNO": "permno", "TICKER": "ticker", "PRC": "close", "RET": "ret"})
)

equity["close"] = pd.to_numeric(equity["close"], errors="coerce").astype("float32")
equity["ret"] = pd.to_numeric(equity["ret"], errors="coerce").astype("float32")

equity = (
    equity
    .sort_values(["permno", "date"])
    .reset_index(drop=True)
)

equity.head()


  pd.read_csv(


Unnamed: 0,permno,date,ticker,close,ret
0,10026,2019-01-02,JJSF,141.0,-0.024829
1,10026,2019-01-03,JJSF,143.020004,0.014326
2,10026,2019-01-04,JJSF,144.839996,0.012725
3,10026,2019-01-07,JJSF,145.410004,0.003935
4,10026,2019-01-08,JJSF,148.699997,0.022626


In [3]:
option_cols = [
    "secid", "date", "exdate", "cp_flag", "strike_price", "volume", "open_interest",
    "impl_volatility", "moneyness", "tte"
]
options = pd.read_csv(
    RAW_DIR / "options_data.csv",
    usecols=option_cols,
    parse_dates=["date", "exdate"],
    dtype={
        "secid": "int32",
        "cp_flag": "category",
        "strike_price": "float32",
        "volume": "float32",
        "open_interest": "float32",
        "impl_volatility": "float32",
        "moneyness": "float32",
        "tte": "float32"
    }
)

call_band = (
    options
    .loc[(options["cp_flag"] == "C") & options["impl_volatility"].notna()]
    .loc[lambda df: df["tte"].between(55, 65)]
    .copy()
)
call_band["dte_distance"] = (call_band["tte"] - 60).abs()

call_60 = (
    call_band.sort_values(["secid", "date", "dte_distance"])
    .drop_duplicates(["secid", "date"], keep="first")
    .rename(columns={"impl_volatility": "iv_call60", "tte": "dte"})
)

call_60 = call_60[["secid", "date", "dte", "iv_call60", "volume", "open_interest", "moneyness", "strike_price"]]
call_60.head()


Unnamed: 0,secid,date,dte,iv_call60,volume,open_interest,moneyness,strike_price
6,5594,2021-10-21,57.0,0.337287,0.0,40.0,1.131222,12500.0
53,5594,2021-11-22,60.0,0.398916,0.0,31.0,1.008878,12500.0
57,5594,2021-11-23,59.0,0.390969,0.0,31.0,1.035626,12500.0
62,5594,2021-11-24,58.0,0.368718,0.0,31.0,1.044277,12500.0
67,5594,2021-11-26,56.0,0.396545,0.0,31.0,1.065644,12500.0


In [4]:
mapping = pd.read_csv(
    RAW_DIR / "permno_secid_mapping.csv",
    parse_dates=["sdate", "edate"],
    dtype={"secid": "int32", "PERMNO": "float32"}
).rename(columns={"PERMNO": "permno"})

merged_opts = call_60.merge(mapping, on="secid", how="left")
valid_window = (
    merged_opts["sdate"].isna().fillna(False) | (merged_opts["date"] >= merged_opts["sdate"])
) & (
    merged_opts["edate"].isna().fillna(False) | (merged_opts["date"] <= merged_opts["edate"])
)
clean_opts = (
    merged_opts.loc[valid_window]
    .drop(columns=["sdate", "edate"])
    .dropna(subset=["permno"])
    .astype({"permno": "int32"})
    .drop_duplicates(["permno", "date"])
    .rename(columns={"volume": "opt_volume", "open_interest": "opt_open_interest"})
)

clean_opts.head()


Unnamed: 0,secid,date,dte,iv_call60,opt_volume,opt_open_interest,moneyness,strike_price,permno
0,5594,2021-10-21,57.0,0.337287,0.0,40.0,1.131222,12500.0,52250
1,5594,2021-11-22,60.0,0.398916,0.0,31.0,1.008878,12500.0,52250
2,5594,2021-11-23,59.0,0.390969,0.0,31.0,1.035626,12500.0,52250
3,5594,2021-11-24,58.0,0.368718,0.0,31.0,1.044277,12500.0,52250
4,5594,2021-11-26,56.0,0.396545,0.0,31.0,1.065644,12500.0,52250


In [5]:
iv_base = (
    clean_opts
    .merge(equity, on=["permno", "date"], how="inner")
    .sort_values(["permno", "date"])
    .reset_index(drop=True)
)

iv_base.head()


Unnamed: 0,secid,date,dte,iv_call60,opt_volume,opt_open_interest,moneyness,strike_price,permno,ticker,close,ret
0,106500,2020-02-18,59.0,0.318465,0.0,2.0,0.871536,150000.0,10026,JJSF,172.110001,-0.008526
1,106500,2020-02-19,58.0,0.285934,0.0,2.0,0.862366,150000.0,10026,JJSF,173.940002,0.010633
2,106500,2020-02-20,57.0,0.277421,0.0,2.0,0.859205,150000.0,10026,JJSF,174.580002,0.003679
3,106500,2020-02-21,56.0,0.314588,0.0,2.0,0.856947,150000.0,10026,JJSF,175.039993,0.002635
4,106500,2020-03-17,59.0,0.673859,0.0,1.0,1.128102,135000.0,10026,JJSF,119.669998,0.086132


In [6]:
coverage = {
    "rows": len(iv_base),
    "symbols": iv_base["permno"].nunique(),
    "start": iv_base["date"].min(),
    "end": iv_base["date"].max(),
    "mean_dte": iv_base["dte"].mean(),
    "dte_band": (iv_base["dte"].min(), iv_base["dte"].max())
}
coverage


{'rows': 479488,
 'symbols': 3993,
 'start': Timestamp('2019-01-14 00:00:00'),
 'end': Timestamp('2023-08-25 00:00:00'),
 'mean_dte': 57.81256,
 'dte_band': (55.0, 60.0)}

In [7]:
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
iv_base.to_parquet(OUTPUT_PATH, index=False)
OUTPUT_PATH


PosixPath('/Users/nikhileshbelulkar/Documents/saturate_the_hypersphere/smile_smirks/processed_data/wq_iv_momentum_base.parquet')

In [8]:
iv_base.sample(5, random_state=42)


Unnamed: 0,secid,date,dte,iv_call60,opt_volume,opt_open_interest,moneyness,strike_price,permno,ticker,close,ret
213941,215739,2023-06-21,58.0,0.725155,92.0,2.0,1.10424,12500.0,21350,TASK,11.32,-0.029991
42038,161679,2021-03-24,58.0,0.350977,21.0,56.0,1.076321,55000.0,13802,APAM,51.099998,0.000196
283909,111969,2021-03-24,58.0,0.312165,0.0,1.0,1.006904,280000.0,62498,WST,278.079987,-0.018252
159999,213573,2021-07-22,57.0,1.706186,0.0,3.0,0.245098,2500.0,18457,THCB,10.2,-0.068493
116233,210643,2020-06-23,59.0,1.10503,8.0,46.0,0.94697,10000.0,16499,PK,10.56,0.022265
