In [3]:
# Data manipulation and analysis
import pandas as pd
import numpy as np
import polars as pl 

# Statistical modeling and diagnostics
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.diagnostic import het_breuschpagan, het_white
from statsmodels.stats.stattools import jarque_bera

# Machine learning and preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Data source
from ucimlrepo import fetch_ucirepo

# Configure plotting
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['font.size'] = 11

plt.rcParams['figure.constrained_layout.use'] = True
print("All libraries imported successfully")

All libraries imported successfully


### Importing options data

In [4]:
option_df = pl.read_csv('raw_data/options_data.csv')

In [5]:
option_df

secid,date,exdate,cp_flag,strike_price,volume,open_interest,impl_volatility,opprc,moneyness,tte,close,spread,mod_open_interest,noi
i64,str,str,str,i64,i64,i64,f64,f64,f64,i64,f64,f64,i64,i64
5594,"""2021-10-07""","""2021-11-19""","""C""",10000,0,2,0.422947,1.25,0.907441,43,11.02,0.4,2,2
5594,"""2021-10-08""","""2021-11-19""","""C""",10000,0,2,0.456293,1.375,0.897666,42,11.14,0.15,2,0
5594,"""2021-10-11""","""2021-11-19""","""C""",10000,0,2,0.438753,1.275,0.904159,39,11.06,0.35,2,0
5594,"""2021-10-12""","""2021-11-19""","""C""",10000,0,2,0.453386,1.3,0.902527,38,11.08,0.4,2,0
5594,"""2021-10-13""","""2021-11-19""","""C""",10000,0,2,0.424783,1.225,0.906618,37,11.03,0.35,2,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
218609,"""2023-08-31""","""2023-09-15""","""P""",90000,0,1,0.477438,11.55,1.14431,15,78.65,2.5,,
218609,"""2023-08-31""","""2023-10-20""","""P""",60000,0,2,0.506986,0.425,0.762873,50,78.65,0.65,,
218609,"""2023-08-31""","""2023-10-20""","""P""",70000,0,19,0.33145,0.825,0.890019,50,78.65,0.15,,
218609,"""2023-08-31""","""2023-10-20""","""P""",75000,0,16,0.24364,1.3,0.953592,50,78.65,1.0,,


In [6]:
start_date = option_df['date'].min()
end_date = option_df['date'].max()
print("Start date =", start_date)
print("End date =", end_date)

Start date = 2019-01-02
End date = 2023-08-31


## Importing returns data

In [7]:
ret_df = (
    pl.read_csv(
        "./raw_data/all_equities.csv",
        schema_overrides={"RET": pl.Utf8}
    )
    .with_columns(pl.col("RET").cast(pl.Float64, strict=False))
)

In [8]:
ret_df

PERMNO,date,TICKER,COMNAM,PRC,RET
i64,str,str,str,f64,f64
10026,"""2019-01-02""","""JJSF""","""J & J SNACK FOODS CORP""",141.0,-0.024829
10026,"""2019-01-03""","""JJSF""","""J & J SNACK FOODS CORP""",143.02,0.014326
10026,"""2019-01-04""","""JJSF""","""J & J SNACK FOODS CORP""",144.84,0.012725
10026,"""2019-01-07""","""JJSF""","""J & J SNACK FOODS CORP""",145.41,0.003935
10026,"""2019-01-08""","""JJSF""","""J & J SNACK FOODS CORP""",148.7,0.022626
…,…,…,…,…,…
93436,"""2023-12-22""","""TSLA""","""TESLA INC""",252.53999,-0.007701
93436,"""2023-12-26""","""TSLA""","""TESLA INC""",256.60999,0.016116
93436,"""2023-12-27""","""TSLA""","""TESLA INC""",261.44,0.018822
93436,"""2023-12-28""","""TSLA""","""TESLA INC""",253.17999,-0.031594


## Mapping sec_id and PERMNO to join them

In [9]:
map_df = pl.read_csv("./raw_data/permno_secid_mapping.csv")

In [10]:
filtered_map = map_df.filter(pl.col('edate')>start_date)
filtered_map

secid,sdate,edate,PERMNO
i64,str,str,i64
5111,"""2021-03-18""","""2023-02-02""",20768
5121,"""2018-02-28""","""2019-08-13""",17295
5131,"""2007-04-02""","""2024-05-02""",88960
5139,"""2002-07-29""","""2024-12-31""",89462
5166,"""2014-01-15""","""2022-05-06""",14380
…,…,…,…
219171,"""2024-01-04""","""2024-12-31""",24747
219172,"""2024-01-04""","""2024-12-31""",24746
219173,"""2024-01-04""","""2024-12-31""",24718
219174,"""2024-01-25""","""2024-12-31""",24685


## Merging secid and permno 

In [11]:
ret_df = ret_df.join(filtered_map,on="PERMNO")
ret_df

PERMNO,date,TICKER,COMNAM,PRC,RET,secid,sdate,edate
i64,str,str,str,f64,f64,i64,str,str
10026,"""2019-01-02""","""JJSF""","""J & J SNACK FOODS CORP""",141.0,-0.024829,106500,"""1996-01-02""","""2024-12-31"""
10026,"""2019-01-03""","""JJSF""","""J & J SNACK FOODS CORP""",143.02,0.014326,106500,"""1996-01-02""","""2024-12-31"""
10026,"""2019-01-04""","""JJSF""","""J & J SNACK FOODS CORP""",144.84,0.012725,106500,"""1996-01-02""","""2024-12-31"""
10026,"""2019-01-07""","""JJSF""","""J & J SNACK FOODS CORP""",145.41,0.003935,106500,"""1996-01-02""","""2024-12-31"""
10026,"""2019-01-08""","""JJSF""","""J & J SNACK FOODS CORP""",148.7,0.022626,106500,"""1996-01-02""","""2024-12-31"""
…,…,…,…,…,…,…,…,…
93436,"""2023-12-22""","""TSLA""","""TESLA INC""",252.53999,-0.007701,143439,"""2010-06-29""","""2024-12-31"""
93436,"""2023-12-26""","""TSLA""","""TESLA INC""",256.60999,0.016116,143439,"""2010-06-29""","""2024-12-31"""
93436,"""2023-12-27""","""TSLA""","""TESLA INC""",261.44,0.018822,143439,"""2010-06-29""","""2024-12-31"""
93436,"""2023-12-28""","""TSLA""","""TESLA INC""",253.17999,-0.031594,143439,"""2010-06-29""","""2024-12-31"""


### Creating formward looking returns for each row of the option or rolling up the options to look at average skew and then also the forward looking returns for the next week

weekly_returns = 

### Modifying options to look at the average vol skew in the previous week for options that are of the same expiration date and relative moneyness

In [27]:
def compute_weekly_iv_skew_streaming(
    csv_path: str,
    dte_limit: int = 60,
    call_range: tuple[float, float] = (0.95, 1.05),
    put_range: tuple[float, float] = (0.85, 1.05),
    weight_scheme: str = "weekday",   # "equal" | "weekday" | "linear" | "exp"
    alpha: float = 0.3,               # for "exp": weight = exp(alpha * (pos-1))
    streaming: bool = True,           # set False if using "linear"/"exp" (windowed)
) -> pl.DataFrame:
    c = pl.col

    # Use scan_csv to stay lazy/streaming; parse date at read-time
    lf = pl.scan_csv(
    csv_path,
    schema_overrides={
        "secid": pl.Int64,
        "date": pl.Date,
        "tte": pl.Int32,
        "impl_volatility": pl.Float32,
        "cp_flag": pl.Categorical,
        "moneyness": pl.Float32,
    },
).select(["secid", "date", "tte", "impl_volatility", "cp_flag", "moneyness"])

    opt = (
        lf
        .filter(
            (c("tte") < dte_limit)
            & c("moneyness").is_not_null()
            & c("impl_volatility").is_finite()
        )
        .with_columns([
            # Monday-anchored calendar week; only trading days exist
            c("date").dt.truncate("1w").alias("week_start"),
            # mark eligible call/put IVs
            pl.when(
                (c("cp_flag") == "C")
                & (c("moneyness") >= call_range[0])
                & (c("moneyness") <= call_range[1])
            ).then(c("impl_volatility")).otherwise(None).alias("call_iv"),
            pl.when(
                (c("cp_flag") == "P")
                & (c("moneyness") >= put_range[0])
                & (c("moneyness") <= put_range[1])
            ).then(c("impl_volatility")).otherwise(None).alias("put_iv"),
        ])
    )

    # Daily skew per secid/date
    daily = (
        opt.group_by(["secid", "date", "week_start"])
           .agg([
               pl.mean("call_iv").alias("call_iv_d"),
               pl.mean("put_iv").alias("put_iv_d"),
           ])
           .filter(c("call_iv_d").is_not_null() & c("put_iv_d").is_not_null())
           .with_columns((c("call_iv_d") - c("put_iv_d")).alias("skew_d"))
    )

    # Weights
    if weight_scheme == "equal":
        daily_w = daily.with_columns(pl.lit(1.0).alias("w"))
        allow_stream = True
    elif weight_scheme == "weekday":
        # Mon..Fri -> 1..5 (holidays/short weeks handled naturally)
        daily_w = daily.with_columns((c("date").dt.weekday() + 1).cast(pl.Float32).alias("w"))
        allow_stream = True
    elif weight_scheme in ("linear", "exp"):
        # Position within week (1..N) requires a window; may disable streaming
        daily_w = (
            daily.sort(["secid", "week_start", "date"])
                 .with_columns((pl.cum_count().over(["secid", "week_start"]) + 1).alias("pos"))
                 .with_columns(
                     pl.when(weight_scheme == "linear")
                       .then(c("pos").cast(pl.Float32))
                       .otherwise((pl.lit(alpha) * (c("pos") - 1)).exp().cast(pl.Float32))
                       .alias("w")
                 )
        )
        allow_stream = False
    else:
        raise ValueError("weight_scheme must be one of {'equal','weekday','linear','exp'}")

    weekly = (
        daily_w.group_by(["secid", "week_start"])
               .agg([
                   ((c("skew_d") * c("w")).sum() / c("w").sum()).alias("IV_skew"),
                   pl.max("date").alias("week_end"),  # last trading day in week
               ])
               .sort(["secid", "week_start"])
               .select(["secid", "week_start", "week_end", "IV_skew"])
    )

    return weekly.collect(streaming=(streaming and allow_stream))

In [28]:
weekly_option_df = compute_weekly_iv_skew_streaming(
    "./raw_data/options_data.csv",
    dte_limit=60,
    weight_scheme="weekday",   # or "equal" / "linear" / "exp"
    alpha=0.4,                 # only used for "exp"
)



In [29]:
weekly_option_df


secid,week_start,week_end,IV_skew
i64,date,date,f32
5594,2021-11-08,2021-11-11,0.076602
5594,2021-11-15,2021-11-19,0.028996
5594,2021-11-22,2021-11-24,0.044062
6646,2020-10-26,2020-10-26,-0.037153
6646,2020-11-16,2020-11-18,0.015387
…,…,…,…
218532,2023-08-21,2023-08-25,0.010728
218532,2023-08-28,2023-08-31,0.503919
218609,2023-08-14,2023-08-18,0.011428
218609,2023-08-21,2023-08-25,-0.004274
