# Data Loading and Preprocessing Pipeline

This notebook loads and preprocesses all raw data for volatility smile/smirk analysis:
- Options data (implied volatility, strike prices, moneyness)
- Equity returns (daily from CRSP)
- Security identifier mappings (PERMNO-secid)
- Fama-French 3 factors (weekly)

**Output**: Clean merged dataset saved as `processed_data/merged_data_with_ff3.parquet`


## 1. Setup and Configuration


In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import polars as pl
import os
from pathlib import Path

print("✓ Libraries imported successfully")


✓ Libraries imported successfully


In [None]:
# Configuration
RAW_DATA_DIR = Path('raw_data')
PROCESSED_DATA_DIR = Path('processed_data')

# Create output directory
PROCESSED_DATA_DIR.mkdir(exist_ok=True)

# Data paths
OPTIONS_PATH = RAW_DATA_DIR / 'options_data.csv'
EQUITIES_PATH = RAW_DATA_DIR / 'all_equities.csv'
MAPPING_PATH = RAW_DATA_DIR / 'permno_secid_mapping.csv'
FF3_PATH = RAW_DATA_DIR / 'F-F_Research_Data_Factors_weekly.csv'

# IV skew computation parameters
DTE_LIMIT = 60  # Days to expiration limit
CALL_RANGE = (0.95, 1.05)  # Moneyness range for calls
PUT_RANGE = (0.85, 1.05)   # Moneyness range for puts
WEIGHT_SCHEME = "weekday"  # Options: "equal", "weekday", "linear", "exp"
ALPHA = 0.4  # Only used for exponential weighting

print(f"✓ Configuration set")
print(f"  Raw data directory: {RAW_DATA_DIR}")
print(f"  Output directory: {PROCESSED_DATA_DIR}")


✓ Configuration set
  Raw data directory: raw_data
  Output directory: processed_data


## 2. Load Raw Data


### 2.1 Load Options Data (Quick Inspection)


In [3]:
# Quick inspection - load first few rows to check date range
options_sample = pl.read_csv(OPTIONS_PATH, n_rows=10000)
print(f"Options data columns: {options_sample.columns}")
print(f"\nDate range (sample): {options_sample['date'].min()} to {options_sample['date'].max()}")
print(f"Unique securities (sample): {options_sample['secid'].n_unique()}")
options_sample.head()


Options data columns: ['secid', 'date', 'exdate', 'cp_flag', 'strike_price', 'volume', 'open_interest', 'impl_volatility', 'opprc', 'moneyness', 'tte', 'close', 'spread', 'mod_open_interest', 'noi']

Date range (sample): 2019-01-02 to 2023-08-31
Unique securities (sample): 4


secid,date,exdate,cp_flag,strike_price,volume,open_interest,impl_volatility,opprc,moneyness,tte,close,spread,mod_open_interest,noi
i64,str,str,str,i64,i64,i64,f64,f64,f64,i64,f64,f64,i64,i64
5594,"""2021-10-07""","""2021-11-19""","""C""",10000,0,2,0.422947,1.25,0.907441,43,11.02,0.4,2,2
5594,"""2021-10-08""","""2021-11-19""","""C""",10000,0,2,0.456293,1.375,0.897666,42,11.14,0.15,2,0
5594,"""2021-10-11""","""2021-11-19""","""C""",10000,0,2,0.438753,1.275,0.904159,39,11.06,0.35,2,0
5594,"""2021-10-12""","""2021-11-19""","""C""",10000,0,2,0.453386,1.3,0.902527,38,11.08,0.4,2,0
5594,"""2021-10-13""","""2021-11-19""","""C""",10000,0,2,0.424783,1.225,0.906618,37,11.03,0.35,2,0


In [4]:
# Load equity returns with proper schema
print("Loading equity returns...")
ret_df = (
    pl.read_csv(
        EQUITIES_PATH,
        schema_overrides={"RET": pl.Utf8}
    )
    .with_columns(pl.col("RET").cast(pl.Float64, strict=False))
)

print(f"✓ Loaded equity returns: {ret_df.shape}")
print(f"  Date range: {ret_df['date'].min()} to {ret_df['date'].max()}")
print(f"  Unique securities: {ret_df['PERMNO'].n_unique()}")
ret_df.head()


Loading equity returns...
✓ Loaded equity returns: (10795504, 6)
  Date range: 2019-01-02 to 2023-12-29
  Unique securities: 12297


PERMNO,date,TICKER,COMNAM,PRC,RET
i64,str,str,str,f64,f64
10026,"""2019-01-02""","""JJSF""","""J & J SNACK FOODS CORP""",141.0,-0.024829
10026,"""2019-01-03""","""JJSF""","""J & J SNACK FOODS CORP""",143.02,0.014326
10026,"""2019-01-04""","""JJSF""","""J & J SNACK FOODS CORP""",144.84,0.012725
10026,"""2019-01-07""","""JJSF""","""J & J SNACK FOODS CORP""",145.41,0.003935
10026,"""2019-01-08""","""JJSF""","""J & J SNACK FOODS CORP""",148.7,0.022626


### 2.3 Load Security Identifier Mapping


In [5]:
# Load PERMNO-secid mapping
print("Loading security identifier mapping...")
map_df = pl.read_csv(MAPPING_PATH)

# Get date range from options data for filtering
start_date = options_sample['date'].min()
filtered_map = map_df.filter(pl.col('edate') > start_date)

print(f"✓ Loaded mapping: {filtered_map.shape}")
print(f"  Unique PERMNOs: {filtered_map['PERMNO'].n_unique()}")
print(f"  Unique secids: {filtered_map['secid'].n_unique()}")
filtered_map.head()


Loading security identifier mapping...
✓ Loaded mapping: (16461, 4)
  Unique PERMNOs: 12492
  Unique secids: 16240


secid,sdate,edate,PERMNO
i64,str,str,i64
5111,"""2021-03-18""","""2023-02-02""",20768
5121,"""2018-02-28""","""2019-08-13""",17295
5131,"""2007-04-02""","""2024-05-02""",88960
5139,"""2002-07-29""","""2024-12-31""",89462
5166,"""2014-01-15""","""2022-05-06""",14380


### 2.4 Load Fama-French 3 Factors


In [6]:
# Load FF3 factors
print("Loading Fama-French 3 factors...")
ff3_factors = pd.read_csv(FF3_PATH)

# Convert percentages to decimals
ff3_factors[['Mkt-RF', 'SMB', 'HML', 'RF']] = ff3_factors[['Mkt-RF', 'SMB', 'HML', 'RF']] / 100

# Convert to Polars and parse dates
ff3_factors_pl = (
    pl.from_pandas(ff3_factors)
    .with_columns([
        pl.col("Date").cast(pl.Int64).cast(pl.Utf8)
          .str.to_date("%Y%m%d")
          .dt.truncate("1w")
          .alias("week_start")
    ])
    .drop("Date")
)

print(f"✓ Loaded FF3 factors: {ff3_factors_pl.shape}")
print(f"  Columns: {ff3_factors_pl.columns}")
ff3_factors_pl.head()


Loading Fama-French 3 factors...
✓ Loaded FF3 factors: (5174, 5)
  Columns: ['Mkt-RF', 'SMB', 'HML', 'RF', 'week_start']


Mkt-RF,SMB,HML,RF,week_start
f64,f64,f64,f64,date
0.0158,-0.0062,-0.0086,0.0006,1926-06-28
0.0037,-0.009,0.0031,0.0006,1926-07-05
0.0098,0.0059,-0.0144,0.0006,1926-07-12
-0.0203,0.0002,-0.0017,0.0006,1926-07-19
0.0306,-0.0189,-0.0085,0.0006,1926-07-26


## 3. Data Transformation and Merging


### 3.1 Add Security IDs to Returns Data


In [7]:
# Merge returns with security mapping
print("Merging returns with security identifiers...")
ret_df = ret_df.join(filtered_map, on="PERMNO")

print(f"✓ Merged returns data: {ret_df.shape}")
print(f"  Columns: {ret_df.columns}")
ret_df.head()


Merging returns with security identifiers...
✓ Merged returns data: (14832526, 9)
  Columns: ['PERMNO', 'date', 'TICKER', 'COMNAM', 'PRC', 'RET', 'secid', 'sdate', 'edate']


PERMNO,date,TICKER,COMNAM,PRC,RET,secid,sdate,edate
i64,str,str,str,f64,f64,i64,str,str
10026,"""2019-01-02""","""JJSF""","""J & J SNACK FOODS CORP""",141.0,-0.024829,106500,"""1996-01-02""","""2024-12-31"""
10026,"""2019-01-03""","""JJSF""","""J & J SNACK FOODS CORP""",143.02,0.014326,106500,"""1996-01-02""","""2024-12-31"""
10026,"""2019-01-04""","""JJSF""","""J & J SNACK FOODS CORP""",144.84,0.012725,106500,"""1996-01-02""","""2024-12-31"""
10026,"""2019-01-07""","""JJSF""","""J & J SNACK FOODS CORP""",145.41,0.003935,106500,"""1996-01-02""","""2024-12-31"""
10026,"""2019-01-08""","""JJSF""","""J & J SNACK FOODS CORP""",148.7,0.022626,106500,"""1996-01-02""","""2024-12-31"""


### 3.2 Compute Weekly Returns from Daily Returns


In [8]:
# Aggregate daily returns to weekly returns
print("Computing weekly returns...")
weekly_ret_df = (
    ret_df
    .with_columns([
        pl.col("date").str.to_date().alias("date"),
    ])
    .with_columns([
        pl.col("date").dt.truncate("1w").alias("week_start"),
    ])
    .filter(pl.col("RET").is_not_null() & pl.col("RET").is_finite())
    .group_by(["PERMNO", "week_start"])
    .agg([
        # Cumulative weekly return: product of (1 + daily returns) - 1
        ((pl.col("RET") + 1).product() - 1).alias("weekly_return"),
        pl.max("date").alias("week_end"),
        pl.first("TICKER").alias("TICKER"),
        pl.first("COMNAM").alias("COMNAM"),
        pl.len().alias("trading_days"),
    ])
    .sort(["PERMNO", "week_start"])
)

# Add secid for joining with options
weekly_ret_df = weekly_ret_df.join(filtered_map, on="PERMNO")

print(f"✓ Weekly returns computed: {weekly_ret_df.shape}")
print(f"  Date range: {weekly_ret_df['week_start'].min()} to {weekly_ret_df['week_end'].max()}")
print(f"  Unique securities: {weekly_ret_df['PERMNO'].n_unique()}")
print(f"\nWeekly Return Statistics:")
print(weekly_ret_df['weekly_return'].describe())
weekly_ret_df.head()


Computing weekly returns...
✓ Weekly returns computed: (3070490, 10)
  Date range: 2018-12-31 to 2023-12-29
  Unique securities: 12267

Weekly Return Statistics:
shape: (9, 2)
┌────────────┬────────────┐
│ statistic  ┆ value      │
│ ---        ┆ ---        │
│ str        ┆ f64        │
╞════════════╪════════════╡
│ count      ┆ 3.07049e6  │
│ null_count ┆ 0.0        │
│ mean       ┆ 0.006488   │
│ std        ┆ 0.303717   │
│ min        ┆ -0.999969  │
│ 25%        ┆ -0.030954  │
│ 50%        ┆ 0.001372   │
│ 75%        ┆ 0.033932   │
│ max        ┆ 297.803918 │
└────────────┴────────────┘


PERMNO,week_start,weekly_return,week_end,TICKER,COMNAM,trading_days,secid,sdate,edate
i64,date,f64,date,str,str,u32,i64,str,str
10026,2018-12-31,0.001728,2019-01-04,"""JJSF""","""J & J SNACK FOODS CORP""",3,106500,"""1996-01-02""","""2024-12-31"""
10026,2019-01-07,0.027962,2019-01-11,"""JJSF""","""J & J SNACK FOODS CORP""",5,106500,"""1996-01-02""","""2024-12-31"""
10026,2019-01-14,-0.011082,2019-01-18,"""JJSF""","""J & J SNACK FOODS CORP""",5,106500,"""1996-01-02""","""2024-12-31"""
10026,2019-01-21,-0.018268,2019-01-25,"""JJSF""","""J & J SNACK FOODS CORP""",4,106500,"""1996-01-02""","""2024-12-31"""
10026,2019-01-28,0.070909,2019-02-01,"""JJSF""","""J & J SNACK FOODS CORP""",5,106500,"""1996-01-02""","""2024-12-31"""


## 4. Compute Weekly Implied Volatility Skew


### 4.1 Define IV Skew Computation Function


In [9]:
def compute_weekly_iv_skew_streaming(
    csv_path: str,
    dte_limit: int = 60,
    call_range: tuple[float, float] = (0.95, 1.05),
    put_range: tuple[float, float] = (0.85, 1.05),
    weight_scheme: str = "weekday",
    alpha: float = 0.3,
    streaming: bool = True,
) -> pl.DataFrame:
    """
    Compute weekly implied volatility skew from options data.
    
    Parameters:
    -----------
    csv_path : str
        Path to options data CSV file
    dte_limit : int
        Maximum days to expiration for options to include
    call_range : tuple
        (min, max) moneyness range for calls
    put_range : tuple
        (min, max) moneyness range for puts
    weight_scheme : str
        Weighting scheme: "equal", "weekday", "linear", "exp"
    alpha : float
        Parameter for exponential weighting
    streaming : bool
        Whether to use streaming mode (faster, lower memory)
    
    Returns:
    --------
    pl.DataFrame with columns: [secid, week_start, week_end, IV_skew]
    """
    c = pl.col

    # Use scan_csv for lazy/streaming evaluation
    lf = pl.scan_csv(
        csv_path,
        schema_overrides={
            "secid": pl.Int64,
            "date": pl.Date,
            "tte": pl.Int32,
            "impl_volatility": pl.Float32,
            "cp_flag": pl.Categorical,
            "moneyness": pl.Float32,
        },
    ).select(["secid", "date", "tte", "impl_volatility", "cp_flag", "moneyness"])

    # Filter and prepare data
    opt = (
        lf
        .filter(
            (c("tte") < dte_limit)
            & c("moneyness").is_not_null()
            & c("impl_volatility").is_finite()
        )
        .with_columns([
            # Monday-anchored calendar week
            c("date").dt.truncate("1w").alias("week_start"),
            # Mark eligible call/put IVs
            pl.when(
                (c("cp_flag") == "C")
                & (c("moneyness") >= call_range[0])
                & (c("moneyness") <= call_range[1])
            ).then(c("impl_volatility")).otherwise(None).alias("call_iv"),
            pl.when(
                (c("cp_flag") == "P")
                & (c("moneyness") >= put_range[0])
                & (c("moneyness") <= put_range[1])
            ).then(c("impl_volatility")).otherwise(None).alias("put_iv"),
        ])
    )

    # Compute daily skew per secid/date
    daily = (
        opt.group_by(["secid", "date", "week_start"])
           .agg([
               pl.mean("call_iv").alias("call_iv_d"),
               pl.mean("put_iv").alias("put_iv_d"),
           ])
           .filter(c("call_iv_d").is_not_null() & c("put_iv_d").is_not_null())
           .with_columns((c("call_iv_d") - c("put_iv_d")).alias("skew_d"))
    )

    # Apply weighting scheme
    if weight_scheme == "equal":
        daily_w = daily.with_columns(pl.lit(1.0).alias("w"))
        allow_stream = True
    elif weight_scheme == "weekday":
        # Mon..Fri -> 1..5
        daily_w = daily.with_columns((c("date").dt.weekday() + 1).cast(pl.Float32).alias("w"))
        allow_stream = True
    elif weight_scheme in ("linear", "exp"):
        # Position within week (1..N)
        daily_w = (
            daily.sort(["secid", "week_start", "date"])
                 .with_columns((pl.cum_count().over(["secid", "week_start"]) + 1).alias("pos"))
                 .with_columns(
                     pl.when(weight_scheme == "linear")
                       .then(c("pos").cast(pl.Float32))
                       .otherwise((pl.lit(alpha) * (c("pos") - 1)).exp().cast(pl.Float32))
                       .alias("w")
                 )
        )
        allow_stream = False
    else:
        raise ValueError("weight_scheme must be one of {'equal','weekday','linear','exp'}")

    # Aggregate to weekly skew
    weekly = (
        daily_w.group_by(["secid", "week_start"])
               .agg([
                   ((c("skew_d") * c("w")).sum() / c("w").sum()).alias("IV_skew"),
                   pl.max("date").alias("week_end"),
               ])
               .sort(["secid", "week_start"])
               .select(["secid", "week_start", "week_end", "IV_skew"])
    )

    return weekly.collect(streaming=(streaming and allow_stream))

print("✓ IV skew function defined")


✓ IV skew function defined


In [10]:
# Compute weekly IV skew (this may take a few minutes)
print(f"Computing weekly IV skew with parameters:")
print(f"  DTE limit: {DTE_LIMIT}")
print(f"  Call range: {CALL_RANGE}")
print(f"  Put range: {PUT_RANGE}")
print(f"  Weight scheme: {WEIGHT_SCHEME}")
print(f"\nProcessing large options dataset...")

weekly_option_df = compute_weekly_iv_skew_streaming(
    str(OPTIONS_PATH),
    dte_limit=DTE_LIMIT,
    call_range=CALL_RANGE,
    put_range=PUT_RANGE,
    weight_scheme=WEIGHT_SCHEME,
    alpha=ALPHA,
)

print(f"\n✓ Weekly IV skew computed: {weekly_option_df.shape}")
print(f"  Unique securities: {weekly_option_df['secid'].n_unique()}")
print(f"  Date range: {weekly_option_df['week_start'].min()} to {weekly_option_df['week_end'].max()}")
print(f"\nIV Skew Statistics:")
print(weekly_option_df['IV_skew'].describe())
weekly_option_df.head()


Computing weekly IV skew with parameters:
  DTE limit: 60
  Call range: (0.95, 1.05)
  Put range: (0.85, 1.05)
  Weight scheme: weekday

Processing large options dataset...

✓ Weekly IV skew computed: (514760, 4)
  Unique securities: 3957
  Date range: 2018-12-31 to 2023-08-31

IV Skew Statistics:
shape: (9, 2)
┌────────────┬───────────┐
│ statistic  ┆ value     │
│ ---        ┆ ---       │
│ str        ┆ f64       │
╞════════════╪═══════════╡
│ count      ┆ 514760.0  │
│ null_count ┆ 0.0       │
│ mean       ┆ 0.003414  │
│ std        ┆ 0.124987  │
│ min        ┆ -5.996383 │
│ 25%        ┆ -0.026518 │
│ 50%        ┆ -0.00973  │
│ 75%        ┆ 0.016515  │
│ max        ┆ 7.52255   │
└────────────┴───────────┘


secid,week_start,week_end,IV_skew
i64,date,date,f32
5594,2021-11-08,2021-11-11,0.076602
5594,2021-11-15,2021-11-19,0.028996
5594,2021-11-22,2021-11-24,0.044062
6646,2020-10-26,2020-10-26,-0.037153
6646,2020-11-16,2020-11-18,0.015387


### 4.3 Save Weekly Options Data


In [11]:
# Save intermediate result
weekly_options_path = PROCESSED_DATA_DIR / 'weekly_option_df.parquet'
weekly_option_df.write_parquet(weekly_options_path)

print(f"✓ Weekly options data saved to: {weekly_options_path}")
print(f"  Size on disk: {os.path.getsize(weekly_options_path) / (1024**2):.2f} MB")


✓ Weekly options data saved to: processed_data/weekly_option_df.parquet
  Size on disk: 2.16 MB


## 5. Merge All Data Sources


### 5.1 Join Options with Returns (Forward-Looking)


In [12]:
# Join options skew with forward returns
# The IV_skew on week_start should predict returns in the following week
print("Joining options data with forward returns...")

merged_df = (
    weekly_option_df
    .join(
        weekly_ret_df.with_columns([
            (pl.col("week_start") - pl.duration(days=7)).alias("prev_week_start")
        ]),
        left_on=["secid", "week_start"],
        right_on=["secid", "prev_week_start"],
        how="left"
    )
)

print(f"✓ Merged options and returns: {merged_df.shape}")
print(f"  Columns: {merged_df.columns}")
merged_df.head()


Joining options data with forward returns...
✓ Merged options and returns: (514768, 13)
  Columns: ['secid', 'week_start', 'week_end', 'IV_skew', 'PERMNO', 'week_start_right', 'weekly_return', 'week_end_right', 'TICKER', 'COMNAM', 'trading_days', 'sdate', 'edate']


secid,week_start,week_end,IV_skew,PERMNO,week_start_right,weekly_return,week_end_right,TICKER,COMNAM,trading_days,sdate,edate
i64,date,date,f32,i64,date,f64,date,str,str,u32,str,str
5594,2021-11-08,2021-11-11,0.076602,52250,2021-11-15,-0.0016,2021-11-19,"""GENC""","""GENCOR INDUSTRIES INC""",5,"""1996-01-01""","""2024-12-31"""
5594,2021-11-15,2021-11-19,0.028996,52250,2021-11-22,-0.060095,2021-11-26,"""GENC""","""GENCOR INDUSTRIES INC""",4,"""1996-01-01""","""2024-12-31"""
5594,2021-11-22,2021-11-24,0.044062,52250,2021-11-29,-0.019608,2021-12-03,"""GENC""","""GENCOR INDUSTRIES INC""",5,"""1996-01-01""","""2024-12-31"""
6646,2020-10-26,2020-10-26,-0.037153,75672,2020-11-02,-0.068293,2020-11-06,"""WWR""","""WESTWATER RESOURCES INC""",5,"""1996-01-01""","""2024-12-31"""
6646,2020-11-16,2020-11-18,0.015387,75672,2020-11-23,-0.119448,2020-11-27,"""WWR""","""WESTWATER RESOURCES INC""",4,"""1996-01-01""","""2024-12-31"""


### 5.2 Add Fama-French 3 Factors


In [13]:
# Join with FF3 factors (matching the returns week)
print("Adding Fama-French 3 factors...")

merged_with_ff = (
    merged_df
    .join(
        ff3_factors_pl,
        left_on="week_start_right",  # The returns week
        right_on="week_start",
        how="left",
        suffix="_ff"
    )
)

print(f"✓ Added FF3 factors: {merged_with_ff.shape}")
print(f"  Columns: {merged_with_ff.columns}")
print(f"\nKey columns preview:")
merged_with_ff.select([
    'secid', 'week_start', 'week_start_right', 'IV_skew', 'weekly_return',
    'Mkt-RF', 'SMB', 'HML', 'RF'
]).head(10)


Adding Fama-French 3 factors...
✓ Added FF3 factors: (514768, 17)
  Columns: ['secid', 'week_start', 'week_end', 'IV_skew', 'PERMNO', 'week_start_right', 'weekly_return', 'week_end_right', 'TICKER', 'COMNAM', 'trading_days', 'sdate', 'edate', 'Mkt-RF', 'SMB', 'HML', 'RF']

Key columns preview:


secid,week_start,week_start_right,IV_skew,weekly_return,Mkt-RF,SMB,HML,RF
i64,date,date,f32,f64,f64,f64,f64,f64
5594,2021-11-08,2021-11-15,0.076602,-0.0016,-0.0015,-0.0181,-0.016,0.0
5594,2021-11-15,2021-11-22,0.028996,-0.060095,-0.0251,-0.0224,0.0266,0.0
5594,2021-11-22,2021-11-29,0.044062,-0.019608,-0.0213,-0.0162,0.013,0.0
6646,2020-10-26,2020-11-02,-0.037153,-0.068293,0.0769,0.0008,-0.0494,0.0
6646,2020-11-16,2020-11-23,0.015387,-0.119448,0.0285,0.0128,0.0131,0.0
6646,2020-12-07,2020-12-14,0.065453,-0.026157,0.0184,0.0199,-0.0351,0.0
6646,2020-12-14,2020-12-21,0.150233,0.109504,0.0015,0.0137,0.0009,0.0
6646,2020-12-21,2020-12-28,0.027616,-0.081937,0.0077,-0.0215,0.0114,0.0
6646,2020-12-28,2021-01-04,0.008089,0.040569,0.0251,0.035,0.0227,0.0
6646,2021-01-04,2021-01-11,0.172665,-0.025341,-0.0106,0.0231,0.0222,0.0


## 6. Data Quality Checks


In [14]:
# Check for missing values
print("Missing value analysis:")
missing_counts = {
    col: merged_with_ff[col].null_count() 
    for col in ['IV_skew', 'weekly_return', 'Mkt-RF', 'SMB', 'HML', 'RF']
}
for col, count in missing_counts.items():
    pct = 100 * count / len(merged_with_ff)
    print(f"  {col}: {count:,} ({pct:.2f}%)")

# Count complete cases
complete_cases = (
    merged_with_ff
    .filter(
        pl.col("IV_skew").is_not_null() &
        pl.col("weekly_return").is_not_null() &
        pl.col("Mkt-RF").is_not_null() &
        pl.col("SMB").is_not_null() &
        pl.col("HML").is_not_null() &
        pl.col("RF").is_not_null()
    )
)

print(f"\nComplete cases (no missing values): {len(complete_cases):,} ({100*len(complete_cases)/len(merged_with_ff):.2f}%)")


Missing value analysis:
  IV_skew: 0 (0.00%)
  weekly_return: 168 (0.03%)
  Mkt-RF: 168 (0.03%)
  SMB: 168 (0.03%)
  HML: 168 (0.03%)
  RF: 168 (0.03%)

Complete cases (no missing values): 514,600 (99.97%)


In [15]:
# Summary statistics for key variables
print("\nSummary Statistics:")
print("\nIV Skew:")
print(complete_cases['IV_skew'].describe())
print("\nWeekly Returns:")
print(complete_cases['weekly_return'].describe())
print("\nMkt-RF:")
print(complete_cases['Mkt-RF'].describe())



Summary Statistics:

IV Skew:
shape: (9, 2)
┌────────────┬───────────┐
│ statistic  ┆ value     │
│ ---        ┆ ---       │
│ str        ┆ f64       │
╞════════════╪═══════════╡
│ count      ┆ 514600.0  │
│ null_count ┆ 0.0       │
│ mean       ┆ 0.003419  │
│ std        ┆ 0.124955  │
│ min        ┆ -5.996383 │
│ 25%        ┆ -0.026517 │
│ 50%        ┆ -0.00973  │
│ 75%        ┆ 0.016505  │
│ max        ┆ 7.52255   │
└────────────┴───────────┘

Weekly Returns:
shape: (9, 2)
┌────────────┬───────────┐
│ statistic  ┆ value     │
│ ---        ┆ ---       │
│ str        ┆ f64       │
╞════════════╪═══════════╡
│ count      ┆ 514600.0  │
│ null_count ┆ 0.0       │
│ mean       ┆ 0.002673  │
│ std        ┆ 0.097926  │
│ min        ┆ -0.984303 │
│ 25%        ┆ -0.034167 │
│ 50%        ┆ 0.001325  │
│ 75%        ┆ 0.035584  │
│ max        ┆ 17.385022 │
└────────────┴───────────┘

Mkt-RF:
shape: (9, 2)
┌────────────┬──────────┐
│ statistic  ┆ value    │
│ ---        ┆ ---      │
│ str        

## 7. Save Final Dataset


In [16]:
# Save the final merged dataset
output_path = PROCESSED_DATA_DIR / 'merged_data_with_ff3.parquet'
merged_with_ff.write_parquet(output_path)

print(f"\n{'='*60}")
print("DATA LOADING COMPLETE")
print(f"{'='*60}")
print(f"\n✓ Final dataset saved to: {output_path}")
print(f"  Shape: {merged_with_ff.shape}")
print(f"  Size on disk: {os.path.getsize(output_path) / (1024**2):.2f} MB")
print(f"\n  Total observations: {len(merged_with_ff):,}")
print(f"  Complete cases: {len(complete_cases):,}")
print(f"  Unique securities: {merged_with_ff['secid'].n_unique()}")
print(f"\n  Columns: {merged_with_ff.columns}")
print(f"\n{'='*60}")
print("Ready for analysis!")
print(f"{'='*60}")



DATA LOADING COMPLETE

✓ Final dataset saved to: processed_data/merged_data_with_ff3.parquet
  Shape: (514768, 17)
  Size on disk: 6.83 MB

  Total observations: 514,768
  Complete cases: 514,600
  Unique securities: 3957

  Columns: ['secid', 'week_start', 'week_end', 'IV_skew', 'PERMNO', 'week_start_right', 'weekly_return', 'week_end_right', 'TICKER', 'COMNAM', 'trading_days', 'sdate', 'edate', 'Mkt-RF', 'SMB', 'HML', 'RF']

Ready for analysis!


## 8. Quick Data Preview


In [17]:
# Display final data structure
merged_with_ff.head(20)


secid,week_start,week_end,IV_skew,PERMNO,week_start_right,weekly_return,week_end_right,TICKER,COMNAM,trading_days,sdate,edate,Mkt-RF,SMB,HML,RF
i64,date,date,f32,i64,date,f64,date,str,str,u32,str,str,f64,f64,f64,f64
5594,2021-11-08,2021-11-11,0.076602,52250,2021-11-15,-0.0016,2021-11-19,"""GENC""","""GENCOR INDUSTRIES INC""",5,"""1996-01-01""","""2024-12-31""",-0.0015,-0.0181,-0.016,0.0
5594,2021-11-15,2021-11-19,0.028996,52250,2021-11-22,-0.060095,2021-11-26,"""GENC""","""GENCOR INDUSTRIES INC""",4,"""1996-01-01""","""2024-12-31""",-0.0251,-0.0224,0.0266,0.0
5594,2021-11-22,2021-11-24,0.044062,52250,2021-11-29,-0.019608,2021-12-03,"""GENC""","""GENCOR INDUSTRIES INC""",5,"""1996-01-01""","""2024-12-31""",-0.0213,-0.0162,0.013,0.0
6646,2020-10-26,2020-10-26,-0.037153,75672,2020-11-02,-0.068293,2020-11-06,"""WWR""","""WESTWATER RESOURCES INC""",5,"""1996-01-01""","""2024-12-31""",0.0769,0.0008,-0.0494,0.0
6646,2020-11-16,2020-11-18,0.015387,75672,2020-11-23,-0.119448,2020-11-27,"""WWR""","""WESTWATER RESOURCES INC""",4,"""1996-01-01""","""2024-12-31""",0.0285,0.0128,0.0131,0.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
6646,2021-03-01,2021-03-05,-0.001298,75672,2021-03-08,0.309623,2021-03-12,"""WWR""","""WESTWATER RESOURCES INC""",5,"""1996-01-01""","""2024-12-31""",0.0332,0.0462,0.0163,0.0
6646,2021-03-08,2021-03-08,0.003886,75672,2021-03-15,-0.086261,2021-03-19,"""WWR""","""WESTWATER RESOURCES INC""",5,"""1996-01-01""","""2024-12-31""",-0.0103,-0.0136,-0.0038,0.0
6646,2021-03-22,2021-03-26,-0.01896,75672,2021-03-29,0.055663,2021-04-01,"""WWR""","""WESTWATER RESOURCES INC""",4,"""1996-01-01""","""2024-12-31""",0.0138,0.0048,-0.0162,0.0
6646,2021-03-29,2021-03-30,0.050432,75672,2021-04-05,-0.049092,2021-04-09,"""WWR""","""WESTWATER RESOURCES INC""",5,"""1996-01-01""","""2024-12-31""",0.0246,-0.0257,-0.0131,0.0


In [18]:
# Sample of complete cases for verification
complete_cases.head(20)


secid,week_start,week_end,IV_skew,PERMNO,week_start_right,weekly_return,week_end_right,TICKER,COMNAM,trading_days,sdate,edate,Mkt-RF,SMB,HML,RF
i64,date,date,f32,i64,date,f64,date,str,str,u32,str,str,f64,f64,f64,f64
5594,2021-11-08,2021-11-11,0.076602,52250,2021-11-15,-0.0016,2021-11-19,"""GENC""","""GENCOR INDUSTRIES INC""",5,"""1996-01-01""","""2024-12-31""",-0.0015,-0.0181,-0.016,0.0
5594,2021-11-15,2021-11-19,0.028996,52250,2021-11-22,-0.060095,2021-11-26,"""GENC""","""GENCOR INDUSTRIES INC""",4,"""1996-01-01""","""2024-12-31""",-0.0251,-0.0224,0.0266,0.0
5594,2021-11-22,2021-11-24,0.044062,52250,2021-11-29,-0.019608,2021-12-03,"""GENC""","""GENCOR INDUSTRIES INC""",5,"""1996-01-01""","""2024-12-31""",-0.0213,-0.0162,0.013,0.0
6646,2020-10-26,2020-10-26,-0.037153,75672,2020-11-02,-0.068293,2020-11-06,"""WWR""","""WESTWATER RESOURCES INC""",5,"""1996-01-01""","""2024-12-31""",0.0769,0.0008,-0.0494,0.0
6646,2020-11-16,2020-11-18,0.015387,75672,2020-11-23,-0.119448,2020-11-27,"""WWR""","""WESTWATER RESOURCES INC""",4,"""1996-01-01""","""2024-12-31""",0.0285,0.0128,0.0131,0.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
6646,2021-03-01,2021-03-05,-0.001298,75672,2021-03-08,0.309623,2021-03-12,"""WWR""","""WESTWATER RESOURCES INC""",5,"""1996-01-01""","""2024-12-31""",0.0332,0.0462,0.0163,0.0
6646,2021-03-08,2021-03-08,0.003886,75672,2021-03-15,-0.086261,2021-03-19,"""WWR""","""WESTWATER RESOURCES INC""",5,"""1996-01-01""","""2024-12-31""",-0.0103,-0.0136,-0.0038,0.0
6646,2021-03-22,2021-03-26,-0.01896,75672,2021-03-29,0.055663,2021-04-01,"""WWR""","""WESTWATER RESOURCES INC""",4,"""1996-01-01""","""2024-12-31""",0.0138,0.0048,-0.0162,0.0
6646,2021-03-29,2021-03-30,0.050432,75672,2021-04-05,-0.049092,2021-04-09,"""WWR""","""WESTWATER RESOURCES INC""",5,"""1996-01-01""","""2024-12-31""",0.0246,-0.0257,-0.0131,0.0


### 2.2 Load Equity Returns Data
