# IV Skew Delta Data Loader

Builds the daily dataset for the IV skew momentum alpha (current IV skew minus its value 25 trading days ago). Output includes the delta factor, forward daily returns, and FF5 controls.


In [4]:
import pandas as pd
import numpy as np
import polars as pl
from pathlib import Path

pd.set_option('display.max_columns', None)
print('✓ Libraries ready')


✓ Libraries ready


In [5]:
RAW_DATA_DIR = Path('raw_data')
PROCESSED_DATA_DIR = Path('processed_data')
PROCESSED_DATA_DIR.mkdir(exist_ok=True)

OPTIONS_PATH = RAW_DATA_DIR / 'options_data.csv'
EQUITIES_PATH = RAW_DATA_DIR / 'all_equities.csv'
MAPPING_PATH = RAW_DATA_DIR / 'permno_secid_mapping.csv'
FF5_DAILY_PATH = RAW_DATA_DIR / 'F-F_Research_Data_5_Factors_2x3_daily.csv'

DTE_RANGE = (45, 75)
CALL_RANGE = (0.95, 1.05)
PUT_RANGE = (0.85, 1.05)
DELTA_WINDOW = 25

print('✓ Config set')
print(f'  Options file: {OPTIONS_PATH}')
print(f'  DTE range: {DTE_RANGE[0]}-{DTE_RANGE[1]} trading days (~2 months)')
print(f'  Delta window: {DELTA_WINDOW} trading days')


✓ Config set
  Options file: raw_data/options_data.csv
  DTE range: 45-75 trading days (~2 months)
  Delta window: 25 trading days


In [6]:
options_sample = pl.read_csv(OPTIONS_PATH, n_rows=5_000)
print(f"Options sample dates: {options_sample['date'].min()} to {options_sample['date'].max()}")
print(f"Columns: {options_sample.columns}")
options_sample.head()


Options sample dates: 2019-05-20 to 2023-08-31
Columns: ['secid', 'date', 'exdate', 'cp_flag', 'strike_price', 'volume', 'open_interest', 'impl_volatility', 'opprc', 'moneyness', 'tte', 'close', 'spread', 'mod_open_interest', 'noi']


secid,date,exdate,cp_flag,strike_price,volume,open_interest,impl_volatility,opprc,moneyness,tte,close,spread,mod_open_interest,noi
i64,str,str,str,i64,i64,i64,f64,f64,f64,i64,f64,f64,i64,i64
5594,"""2021-10-07""","""2021-11-19""","""C""",10000,0,2,0.422947,1.25,0.907441,43,11.02,0.4,2,2
5594,"""2021-10-08""","""2021-11-19""","""C""",10000,0,2,0.456293,1.375,0.897666,42,11.14,0.15,2,0
5594,"""2021-10-11""","""2021-11-19""","""C""",10000,0,2,0.438753,1.275,0.904159,39,11.06,0.35,2,0
5594,"""2021-10-12""","""2021-11-19""","""C""",10000,0,2,0.453386,1.3,0.902527,38,11.08,0.4,2,0
5594,"""2021-10-13""","""2021-11-19""","""C""",10000,0,2,0.424783,1.225,0.906618,37,11.03,0.35,2,0


In [7]:
print('Loading FF5 daily factors...')
ff5_daily = pd.read_csv(FF5_DAILY_PATH)
ff5_daily[["Mkt-RF", "SMB", "HML", "RMW", "CMA", "RF"]] = ff5_daily[["Mkt-RF", "SMB", "HML", "RMW", "CMA", "RF"]] / 100
ff5_daily_pl = (
    pl.from_pandas(ff5_daily)
    .with_columns(pl.col('Date').cast(pl.Int64).cast(pl.Utf8).str.to_date('%Y%m%d').alias('date'))
    .select(['date', 'Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'RF'])
)
print(f'✓ FF5 shape: {ff5_daily_pl.shape}')
ff5_daily_pl.head()


Loading FF5 daily factors...
✓ FF5 shape: (15667, 7)


date,Mkt-RF,SMB,HML,RMW,CMA,RF
date,f64,f64,f64,f64,f64,f64
1963-07-01,-0.0067,0.0,-0.0034,-0.0001,0.0016,0.0001
1963-07-02,0.0079,-0.0026,0.0026,-0.0007,-0.002,0.0001
1963-07-03,0.0063,-0.0017,-0.0009,0.0018,-0.0034,0.0001
1963-07-05,0.004,0.0008,-0.0027,0.0009,-0.0034,0.0001
1963-07-08,-0.0063,0.0004,-0.0018,-0.0029,0.0014,0.0001


In [8]:
def compute_daily_iv_skew(
    csv_path: str,
    dte_range: tuple[int, int] = (45, 75),
    call_range: tuple[float, float] = (0.95, 1.05),
    put_range: tuple[float, float] = (0.85, 1.05),
    streaming: bool = True,
) -> pl.DataFrame:
    c = pl.col
    lf = pl.scan_csv(
        csv_path,
        schema_overrides={
            'secid': pl.Int64,
            'date': pl.Date,
            'tte': pl.Int32,
            'impl_volatility': pl.Float32,
            'cp_flag': pl.Categorical,
            'moneyness': pl.Float32,
        },
    ).select(['secid', 'date', 'tte', 'impl_volatility', 'cp_flag', 'moneyness'])

    opt = (
        lf
        .filter(
            (c('tte') >= dte_range[0])
            & (c('tte') <= dte_range[1])
            & c('moneyness').is_not_null()
            & c('impl_volatility').is_finite()
        )
        .with_columns([
            pl.when(
                (c('cp_flag') == 'C')
                & (c('moneyness') >= call_range[0])
                & (c('moneyness') <= call_range[1])
            ).then(c('impl_volatility')).otherwise(None).alias('call_iv'),
            pl.when(
                (c('cp_flag') == 'P')
                & (c('moneyness') >= put_range[0])
                & (c('moneyness') <= put_range[1])
            ).then(c('impl_volatility')).otherwise(None).alias('put_iv'),
        ])
    )

    daily = (
        opt.group_by(['secid', 'date'])
           .agg([
               pl.mean('call_iv').alias('call_iv_d'),
               pl.mean('put_iv').alias('put_iv_d'),
           ])
           .filter(c('call_iv_d').is_not_null() & c('put_iv_d').is_not_null())
           .with_columns((c('call_iv_d') - c('put_iv_d')).alias('IV_skew'))
           .select(['secid', 'date', 'IV_skew'])
    )

    return daily.collect(streaming=streaming)

print('Computing daily IV skew...')
daily_iv_df = compute_daily_iv_skew(
    str(OPTIONS_PATH),
    dte_range=DTE_RANGE,
    call_range=CALL_RANGE,
    put_range=PUT_RANGE,
)
print(f'✓ Daily IV skew rows: {daily_iv_df.shape}')
daily_iv_df.head()


Computing daily IV skew...
✓ Daily IV skew rows: (866146, 3)


secid,date,IV_skew
i64,date,f32
137684,2020-10-05,0.009462
101966,2020-05-22,-0.022225
108236,2021-03-29,-0.03341
107823,2020-03-26,0.044398
166761,2023-08-01,-0.032095


In [16]:
print('Computing IV skew delta over 25 trading days...')
daily_iv_with_delta = (
    daily_iv_df
    .sort(['secid', 'date'])
    .with_columns([
        pl.col('IV_skew').shift(DELTA_WINDOW).over('secid').alias('IV_skew_lag25'),
    ])
    .with_columns((pl.col('IV_skew') - pl.col('IV_skew_lag25')).alias('IV_skew_delta_25'))
    .filter(pl.col('IV_skew_lag25').is_not_null())
)
print(f"✓ Delta rows (lag-complete): {daily_iv_with_delta.shape}")
daily_iv_with_delta.head()


Computing IV skew delta over 25 trading days...
✓ Delta rows (lag-complete): (779309, 5)


secid,date,IV_skew,IV_skew_lag25,IV_skew_delta_25
i64,date,f32,f32,f32
6646,2023-03-06,-0.174937,0.018285,-0.193222
6646,2023-04-04,-0.145058,0.098777,-0.243835
8170,2019-10-22,-0.041055,0.019375,-0.06043
8170,2019-10-23,0.058631,-0.070188,0.128819
8170,2019-10-24,0.078637,-0.009429,0.088066


In [17]:
print('Loading CRSP returns + mapping...')
ret_df = (
    pl.read_csv(EQUITIES_PATH, schema_overrides={'RET': pl.Utf8})
      .with_columns(pl.col('RET').cast(pl.Float64, strict=False))
)
map_df = pl.read_csv(MAPPING_PATH)
start_date = options_sample['date'].min()
filtered_map = map_df.filter(pl.col('edate') > start_date)

ret_with_ids = (
    ret_df.join(filtered_map, on='PERMNO')
          .with_columns([
              pl.col('date').str.to_date().alias('date'),
              (pl.col('date').str.to_date() - pl.duration(days=1)).alias('prev_date'),
              pl.col('RET').alias('next_return')
          ])
          .rename({'date': 'next_date'})
          .select(['PERMNO', 'secid', 'TICKER', 'COMNAM', 'PRC', 'next_return', 'next_date', 'prev_date'])
)
print(f"✓ Returns rows: {ret_with_ids.shape}")
ret_with_ids.head()


Loading CRSP returns + mapping...
✓ Returns rows: (14808336, 8)


PERMNO,secid,TICKER,COMNAM,PRC,next_return,next_date,prev_date
i64,i64,str,str,f64,f64,date,date
10026,106500,"""JJSF""","""J & J SNACK FOODS CORP""",141.0,-0.024829,2019-01-02,2019-01-01
10026,106500,"""JJSF""","""J & J SNACK FOODS CORP""",143.02,0.014326,2019-01-03,2019-01-02
10026,106500,"""JJSF""","""J & J SNACK FOODS CORP""",144.84,0.012725,2019-01-04,2019-01-03
10026,106500,"""JJSF""","""J & J SNACK FOODS CORP""",145.41,0.003935,2019-01-07,2019-01-06
10026,106500,"""JJSF""","""J & J SNACK FOODS CORP""",148.7,0.022626,2019-01-08,2019-01-07


In [18]:
print('Joining IV skew delta with forward returns...')
merged = (
    daily_iv_with_delta.rename({'date': 'iv_date'})
    .join(ret_with_ids, left_on=['secid', 'iv_date'], right_on=['secid', 'prev_date'], how='left')
)
print(f'✓ Merge shape: {merged.shape}')
merged.head()


Joining IV skew delta with forward returns...
✓ Merge shape: (779310, 11)


secid,iv_date,IV_skew,IV_skew_lag25,IV_skew_delta_25,PERMNO,TICKER,COMNAM,PRC,next_return,next_date
i64,date,f32,f32,f32,i64,str,str,f64,f64,date
6646,2023-03-06,-0.174937,0.018285,-0.193222,75672,"""WWR""","""WESTWATER RESOURCES INC""",1.09,0.128364,2023-03-07
6646,2023-04-04,-0.145058,0.098777,-0.243835,75672,"""WWR""","""WESTWATER RESOURCES INC""",0.9999,-0.038558,2023-04-05
8170,2019-10-22,-0.041055,0.019375,-0.06043,80341,"""MPAA""","""MOTORCAR PARTS OF AMERICA INC""",17.61,0.026224,2019-10-23
8170,2019-10-23,0.058631,-0.070188,0.128819,80341,"""MPAA""","""MOTORCAR PARTS OF AMERICA INC""",17.81,0.011357,2019-10-24
8170,2019-10-24,0.078637,-0.009429,0.088066,80341,"""MPAA""","""MOTORCAR PARTS OF AMERICA INC""",18.24,0.024144,2019-10-25


In [19]:
print('Adding FF5 controls and computing excess return...')
merged_with_ff5 = (
    merged
    .join(ff5_daily_pl, left_on='next_date', right_on='date', how='left', suffix='_ff')
    .drop('date', strict=False)
    .with_columns((pl.col('next_return') - pl.col('RF')).alias('excess_return'))
)
print(f'✓ Final shape pre-clean: {merged_with_ff5.shape}')
merged_with_ff5.head()


Adding FF5 controls and computing excess return...
✓ Final shape pre-clean: (779310, 18)


secid,iv_date,IV_skew,IV_skew_lag25,IV_skew_delta_25,PERMNO,TICKER,COMNAM,PRC,next_return,next_date,Mkt-RF,SMB,HML,RMW,CMA,RF,excess_return
i64,date,f32,f32,f32,i64,str,str,f64,f64,date,f64,f64,f64,f64,f64,f64,f64
6646,2023-03-06,-0.174937,0.018285,-0.193222,75672,"""WWR""","""WESTWATER RESOURCES INC""",1.09,0.128364,2023-03-07,-0.0145,0.0063,-0.0067,-0.0013,-0.0012,0.0002,0.128164
6646,2023-04-04,-0.145058,0.098777,-0.243835,75672,"""WWR""","""WESTWATER RESOURCES INC""",0.9999,-0.038558,2023-04-05,-0.0039,-0.01,0.014,0.0079,0.0129,0.0002,-0.038758
8170,2019-10-22,-0.041055,0.019375,-0.06043,80341,"""MPAA""","""MOTORCAR PARTS OF AMERICA INC""",17.61,0.026224,2019-10-23,0.0025,-0.001,0.0027,-0.0001,0.0009,0.0001,0.026124
8170,2019-10-23,0.058631,-0.070188,0.128819,80341,"""MPAA""","""MOTORCAR PARTS OF AMERICA INC""",17.81,0.011357,2019-10-24,0.0025,-0.005,-0.0089,-0.0005,-0.0061,0.0001,0.011257
8170,2019-10-24,0.078637,-0.009429,0.088066,80341,"""MPAA""","""MOTORCAR PARTS OF AMERICA INC""",18.24,0.024144,2019-10-25,0.005,0.004,0.0007,0.0034,0.0,0.0001,0.024044


In [20]:
required_cols = [
    'IV_skew', 'IV_skew_lag25', 'IV_skew_delta_25',
    'next_return', 'excess_return',
    'Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'RF'
]
complete = merged_with_ff5.drop_nulls(required_cols)
print(f"Complete rows: {complete.shape[0]:,} / {merged_with_ff5.shape[0]:,}")


Complete rows: 624,626 / 779,310


In [21]:
output_path = PROCESSED_DATA_DIR / 'daily_iv_skew_delta.parquet'
complete.write_parquet(output_path)
print(f'✓ Saved IV skew delta dataset to {output_path}')
print(f"  Rows: {complete.shape[0]:,}")
print(f"  Unique secids: {complete['secid'].n_unique()}")


✓ Saved IV skew delta dataset to processed_data/daily_iv_skew_delta.parquet
  Rows: 624,626
  Unique secids: 3140


In [22]:
factor_stats = complete.select([
    pl.col('IV_skew_delta_25').mean().alias('mean_delta'),
    pl.col('IV_skew_delta_25').std().alias('std_delta'),
    pl.col('IV_skew_delta_25').quantile(0.05).alias('p05'),
    pl.col('IV_skew_delta_25').quantile(0.5).alias('median'),
    pl.col('IV_skew_delta_25').quantile(0.95).alias('p95'),
])
factor_stats.to_pandas()


Unnamed: 0,mean_delta,std_delta,p05,median,p95
0,0.00066,0.098721,-0.086146,0.000318,0.088547
