In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, name) for name in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# ===============================================================
# 0. Imports & helpers
# ===============================================================

import numpy as np, pandas as pd, scipy.optimize as opt, math, gc, pickle, pathlib
from tqdm.auto import tqdm
from catboost import CatBoostRegressor, Pool
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

pathlib.Path("cache").mkdir(exist_ok=True)

# ---------- SVI functions ----------
def svi_total_variance(p, k):
    """Evaluate the raw SVI expression a + b*(rho*(k-m) + sqrt((k-m)^2 + sig^2)).

    p -- 5-sequence unpacked as (a, b, rho, m, sig)
    k -- scalar or array of log-moneyness values
    """
    a, b, rho, m, sig = p
    shifted = k - m
    curvature = np.sqrt(shifted ** 2 + sig ** 2)
    return a + b * (rho * shifted + curvature)

def svi_obj(p, k_obs, iv_obs):
    """Sum of squared differences between sqrt(SVI variance) and observed IVs.

    Returns +inf as soon as any modelled variance is non-positive so the
    optimiser treats that parameter set as infeasible.
    """
    total_var = svi_total_variance(p, k_obs)
    if (total_var <= 0).any():
        return np.inf
    errors = np.sqrt(total_var) - iv_obs
    return (errors ** 2).sum()

def calibrate_svi(strikes, ivs, fpx):
    """Fit the five SVI parameters to one smile with bounded L-BFGS-B.

    strikes -- array of strikes with an observed IV
    ivs     -- array of the matching observed IVs
    fpx     -- reference price used to turn strikes into log-moneyness

    Returns the fitted parameter array, or None when fewer than 5 points are
    supplied, the optimiser raises, or it reports failure.
    """
    if len(strikes) < 5:               # not enough pts
        return None

    log_mny = np.log(strikes / fpx)
    a_start = max(ivs.min() ** 2 * 0.8, 1e-6)
    start = np.array([a_start, 0.1, 0.0, 0.0, 0.1])
    limits = [
        (1e-8, np.inf),                             # a
        (1e-8, np.inf),                             # b
        (-.999, .999),                              # rho
        (log_mny.min() - .5, log_mny.max() + .5),   # m
        (1e-8, 5),                                  # sig
    ]

    try:
        fit = opt.minimize(
            svi_obj, start, args=(log_mny, ivs), bounds=limits,
            method="L-BFGS-B",
            options={"maxiter": 2000, "ftol": 1e-12},
        )
    except Exception:                  # failed fit
        return None
    return fit.x if fit.success else None

def svi_iv(p, strikes, fpx):
    """Evaluate the SVI smile at the given strikes.

    With p None the result is an all-NaN array of len(strikes); otherwise it is
    the square root of the SVI variance at log(strikes / fpx), floored at 1e-14
    before the square root.
    """
    if p is not None:
        total_var = svi_total_variance(p, np.log(strikes / fpx))
        return np.sqrt(np.maximum(total_var, 1e-14))
    return np.full(len(strikes), np.nan)

# ---------- 30-min block helper ----------
def assign_blocks(ts_ns, block_sec=30*60):
    """Map integer nanosecond timestamps to consecutive block numbers.

    The timestamps are floored to whole seconds and then integer-divided by
    block_sec (default 30 minutes).
    """
    whole_seconds = (ts_ns.astype(np.int64) // 10**9).astype(int)
    block_ids = whole_seconds // block_sec
    return block_ids.astype(int)

# ===============================================================
# 1. Melt wide → long   (cached)
# ===============================================================

# The long-format training frame is cached as parquet; rebuild it only when missing.
p = pathlib.Path("cache/train_long_raw.parquet")
if not p.exists():
    train_wide = pd.read_parquet("/kaggle/input/nk-iv-prediction/train_data.parquet")
    iv_cols = [c for c in train_wide.columns if c.startswith(("call_iv_","put_iv_"))]
    calls = [c for c in iv_cols if c.startswith("call_iv_")]
    puts  = [c for c in iv_cols if c.startswith("put_iv_")]

    def melt(df, cols, cp_flag, prefix):
        """Unpivot one option side: one output row per (input row, IV column).

        The strike is parsed from the column name by removing `prefix`;
        `cp_flag` is stored as a constant column.
        """
        id_cols = ["timestamp","underlying","expiry"] + [f"X{i}" for i in range(42)]
        long_df = df.melt(id_vars=id_cols, value_vars=cols,
                          var_name="var", value_name="iv")
        long_df["cp_flag"] = cp_flag
        long_df["strike"]  = long_df["var"].str.replace(prefix,"").astype(int)
        return long_df.drop(columns="var")

    call_part = melt(train_wide, calls, 0,"call_iv_")
    put_part  = melt(train_wide, puts , 1,"put_iv_")
    train_long = pd.concat([call_part, put_part], ignore_index=True)
    train_long.to_parquet(p, index=False)
    print("✓ melted train_long written to cache/train_long_raw.parquet")
else:
    train_long = pd.read_parquet(p)
    print("✓ loaded melted train_long from cache.")

# ===============================================================
# 2. SVI calibration per snapshot  (cached)
# ===============================================================

# Cache locations for the per-snapshot SVI parameters and the frame carrying iv_svi.
param_file = pathlib.Path("cache/svi_params.pkl")
svi_file   = pathlib.Path("cache/train_with_svi.parquet")

if svi_file.exists() and param_file.exists():
    train_long = pd.read_parquet(svi_file)
    # NOTE(review): pickle executes arbitrary code on load; only acceptable here
    # because this file is written by this notebook itself (see the else branch).
    params_dict = pickle.loads(param_file.read_bytes())
    print("✓ loaded iv_svi & params from cache.")
else:
    params_dict = {}
    train_long["iv_svi"] = np.nan

    grp_cols = ["timestamp","expiry","underlying"]
    # Build the groupby once and reuse it for both the iteration and the
    # progress-bar total (the group count does not depend on `sort`).
    snapshots = train_long.groupby(grp_cols, sort=False)
    for key, sub in tqdm(snapshots, total=snapshots.ngroups, desc="SVI fits"):
        strikes  = sub["strike"].values
        ivs      = sub["iv"].values
        observed = sub["iv"].notna().values      # mask computed once per snapshot
        # key[2] is the snapshot's `underlying`, used as the reference price.
        p = calibrate_svi(strikes[observed], ivs[observed], key[2])
        params_dict[key] = p
        # Evaluate the fitted smile at every strike of the snapshot (NaN when p is None).
        train_long.loc[sub.index, "iv_svi"] = svi_iv(p, strikes, key[2])

    train_long.to_parquet(svi_file, index=False)
    param_file.write_bytes(pickle.dumps(params_dict))
    print("✓ iv_svi + params cached to disk.")

# ===============================================================
# 3. Feature engineering   (cheap, no need to cache)
# ===============================================================

# Moneyness-derived features, computed on the full long frame.
train_long["log_mny"]  = np.log(train_long["strike"] / train_long["underlying"])
train_long["abs_mny"]  = train_long["log_mny"].abs()
train_long["log_mny_sq"] = train_long["log_mny"]**2
train_long["log_mny_cu"] = train_long["log_mny"]**3
train_long["abs_strike_minus_underlying"] = (train_long["strike"] - train_long["underlying"]).abs()
train_long["cpflag_x_absmny"] = train_long["cp_flag"] * train_long["abs_mny"]

base_feats = [
    "log_mny","abs_mny","log_mny_sq","log_mny_cu",
    "abs_strike_minus_underlying","cpflag_x_absmny",
    "iv_svi","cp_flag"
] + [f"X{i}" for i in range(42)]

# The regression target is iv - iv_svi, so a training row needs BOTH an observed
# IV and an SVI baseline. Snapshots whose SVI fit failed (or had < 5 points) carry
# iv_svi = NaN; keeping them would put NaN into the target and into the RMSE.
has_target   = train_long["iv"].notna()
has_baseline = train_long["iv_svi"].notna()
n_without_baseline = int((has_target & ~has_baseline).sum())
if n_without_baseline:
    print(f"⚠ dropping {n_without_baseline} observed rows that have no SVI baseline")

train_fit = train_long[has_target & has_baseline].reset_index(drop=True)
train_fit["residual"] = train_fit["iv"] - train_fit["iv_svi"]

# ===============================================================
# 4. CatBoost 5-fold CV  (cached)
# ===============================================================

oof_file = pathlib.Path("cache/oof_cb.npy")
model_dir = pathlib.Path("cache/cat_models")
model_dir.mkdir(exist_ok=True)

# Block ids are needed by every CV stage, so compute them unconditionally.
# (Previously they were only created when CatBoost was retrained, which left the
# LightGBM stage without a `block_id` column whenever this stage hit its cache.)
train_fit["block_id"] = assign_blocks(train_fit["timestamp"])

cat_cache_ok = oof_file.exists() and all((model_dir/f"fold{i}.cbm").exists() for i in range(5))
if cat_cache_ok:
    oof_cb = np.load(oof_file)
    # A cached OOF vector of the wrong length belongs to a different train_fit.
    cat_cache_ok = len(oof_cb) == len(train_fit)

if cat_cache_ok:
    cat_models = [CatBoostRegressor().load_model(str(model_dir/f"fold{i}.cbm")) for i in range(5)]
    print("✓ CatBoost OOF & models loaded from cache.")
else:
    cat_models, oof_cb = [], np.zeros(len(train_fit))
    blocks = np.sort(train_fit["block_id"].unique())
    nfold, step = 5, math.ceil(len(blocks)/5)

    for f in range(nfold):
        # Each fold validates on one contiguous run of sorted block ids.
        val_blocks = blocks[f*step:(f+1)*step]
        val_mask   = train_fit["block_id"].isin(val_blocks)

        # .copy() so the dtype cast below edits an independent frame, not a slice.
        X_tr = train_fit.loc[~val_mask, base_feats].copy()
        X_va = train_fit.loc[ val_mask, base_feats].copy()
        y_tr = train_fit.loc[~val_mask, "residual"]
        y_va = train_fit.loc[ val_mask, "residual"]
        X_tr["cp_flag"] = X_tr["cp_flag"].astype(int)
        X_va["cp_flag"] = X_va["cp_flag"].astype(int)

        cb = CatBoostRegressor(
            iterations=2000, depth=6, learning_rate=0.05, l2_leaf_reg=3.0,
            eval_metric="RMSE", task_type="GPU", random_seed=42,
            early_stopping_rounds=200, verbose=200)
        cb.fit(Pool(X_tr,y_tr,cat_features=["cp_flag"]),
               eval_set=Pool(X_va,y_va,cat_features=["cp_flag"]))

        oof_cb[val_mask.values] = cb.predict(X_va)
        cb.save_model(str(model_dir/f"fold{f}.cbm"))
        cat_models.append(cb)
        gc.collect()

    np.save(oof_file, oof_cb)
    # sqrt(MSE) instead of squared=False, which newer scikit-learn no longer accepts.
    print("✓ CatBoost OOF RMSE:", np.sqrt(mean_squared_error(train_fit["residual"], oof_cb)))

# ===============================================================
# 5. LightGBM stack  (cached)
# ===============================================================

oof_lgb_file  = pathlib.Path("cache/oof_lgb.npy")
lgb_dir       = pathlib.Path("cache/lgb_models")
lgb_dir.mkdir(exist_ok=True)

lgb_cache_ok = oof_lgb_file.exists() and all((lgb_dir/f"fold{i}.txt").exists() for i in range(5))
if lgb_cache_ok:
    oof_lgb = np.load(oof_lgb_file)
    # A cached OOF vector of the wrong length belongs to a different train_fit.
    lgb_cache_ok = len(oof_lgb) == len(train_fit)

if lgb_cache_ok:
    lgb_models = [lgb.Booster(model_file=str(lgb_dir/f"fold{i}.txt")) for i in range(5)]
    print("✓ LightGBM OOF & models loaded from cache.")
else:
    # The CatBoost stage may have been served from cache without creating the
    # block ids, so make sure they exist before splitting.
    if "block_id" not in train_fit.columns:
        train_fit["block_id"] = assign_blocks(train_fit["timestamp"])

    # Stack: the CatBoost out-of-fold prediction becomes an extra feature.
    train_fit["Z_cb"] = oof_cb
    lgb_feats = base_feats + ["Z_cb"]

    oof_lgb, lgb_models = np.zeros(len(train_fit)), []
    blocks = np.sort(train_fit["block_id"].unique())
    nfold, step = 5, math.ceil(len(blocks)/5)

    for f in range(nfold):
        val_blocks = blocks[f*step:(f+1)*step]
        val_mask   = train_fit["block_id"].isin(val_blocks)

        # .copy() so the dtype cast below edits an independent frame, not a slice.
        X_tr = train_fit.loc[~val_mask, lgb_feats].copy()
        X_va = train_fit.loc[ val_mask, lgb_feats].copy()
        y_tr = train_fit.loc[~val_mask, "residual"]
        y_va = train_fit.loc[ val_mask, "residual"]
        for c in ["cp_flag"]:                   # cast cat
            X_tr[c] = X_tr[c].astype(int)
            X_va[c] = X_va[c].astype(int)

        dtr = lgb.Dataset(X_tr, y_tr, categorical_feature=["cp_flag"])
        dva = lgb.Dataset(X_va, y_va, categorical_feature=["cp_flag"])

        params = dict(objective="regression", metric="rmse",
                      boosting="gbdt", learning_rate=0.03,
                      num_leaves=64, feature_fraction=0.8,
                      bagging_fraction=0.8, bagging_freq=5,
                      lambda_l1=0.5, lambda_l2=1.0, seed=42)
        # Early stopping / logging go through callbacks: the `early_stopping_rounds`
        # and `verbose_eval` keyword arguments no longer exist in LightGBM 4.
        bst = lgb.train(params, dtr, num_boost_round=5000, valid_sets=[dva],
                        callbacks=[lgb.early_stopping(stopping_rounds=200),
                                   lgb.log_evaluation(period=200)])
        oof_lgb[val_mask.values] = bst.predict(X_va, num_iteration=bst.best_iteration)
        bst.save_model(str(lgb_dir/f"fold{f}.txt"))
        lgb_models.append(bst)

    np.save(oof_lgb_file, oof_lgb)
    # sqrt(MSE) instead of squared=False, which newer scikit-learn no longer accepts.
    print("✓ CatBoost+LightGBM OOF RMSE:",
          np.sqrt(mean_squared_error(train_fit["residual"], oof_lgb)))

# ===============================================================
# 6. Inference helper (uses cached models)
# ===============================================================

def predict_iv_for_test(test_parquet, out_csv):
    """Impute the missing IV cells of a wide test file and write them to CSV.

    Parameters
    ----------
    test_parquet : path of the wide test parquet (same column layout as train:
                   timestamp, underlying, expiry, X0..X41, call_iv_*/put_iv_*).
    out_csv      : path of the CSV to write.

    Pipeline: melt to long form -> per-snapshot SVI baseline fitted on the
    snapshot's OWN observed IVs -> residual from the CatBoost/LightGBM stack ->
    pivot back to wide. Observed IVs are kept as they are; only missing cells
    receive a prediction.

    Relies on the notebook-level names `calibrate_svi`, `svi_iv`, `params_dict`,
    `base_feats`, `cat_models` and `lgb_models`.
    """
    print(">>> inference …")
    test_wide = pd.read_parquet(test_parquet).reset_index(drop=True)

    id_cols   = ["timestamp","underlying","expiry"]+[f"X{i}" for i in range(42)]
    grp_cols  = ["timestamp","expiry","underlying"]
    call_cols = [c for c in test_wide.columns if c.startswith("call_iv_")]
    put_cols  = [c for c in test_wide.columns if c.startswith("put_iv_")]
    iv_cols   = call_cols + put_cols

    # ----- melt -----
    # `_row` remembers which wide row every long row came from, so the pivot
    # back is exact even when several rows share the same snapshot key.
    keyed = test_wide.assign(_row=np.arange(len(test_wide)))

    def _melt(cols, flag, prefix):
        """Unpivot one option side, keeping the observed IV and the source column name."""
        long_df = keyed.melt(id_vars=["_row"]+id_cols, value_vars=cols,
                             var_name="var", value_name="iv")
        long_df["cp_flag"] = flag
        long_df["strike"]  = long_df["var"].str.replace(prefix, "", regex=False).astype(int)
        return long_df

    tl = pd.concat([_melt(call_cols, 0, "call_iv_"),
                    _melt(put_cols , 1, "put_iv_")], ignore_index=True)

    # ----- SVI baseline -----
    # Fit on the observed IVs of each test snapshot. Looking the key up in the
    # training-time `params_dict` alone cannot work for unseen snapshots, so it
    # is only kept as a fallback when the fit on the test snapshot fails.
    tl["iv_svi"] = np.nan
    snapshots = tl.groupby(grp_cols, sort=False)
    for key, sub in tqdm(snapshots, total=snapshots.ngroups, desc="SVI (test)"):
        strikes  = sub["strike"].values
        ivs      = sub["iv"].values.astype(float)
        observed = np.isfinite(ivs)
        p = calibrate_svi(strikes[observed], ivs[observed], key[2])
        if p is None:
            p = params_dict.get(key)
        tl.loc[sub.index,"iv_svi"] = svi_iv(p, strikes, key[2])

    # ----- same features -----
    tl["log_mny"] = np.log(tl["strike"]/tl["underlying"])
    tl["abs_mny"] = tl["log_mny"].abs()
    tl["log_mny_sq"] = tl["log_mny"]**2
    tl["log_mny_cu"] = tl["log_mny"]**3
    tl["abs_strike_minus_underlying"] = (tl["strike"]-tl["underlying"]).abs()
    tl["cpflag_x_absmny"] = tl["cp_flag"]*tl["abs_mny"]

    X_cat = tl[base_feats].copy()
    X_cat["cp_flag"] = X_cat["cp_flag"].astype(int)
    Zcb = sum(m.predict(X_cat) for m in cat_models)/len(cat_models)

    X_lgb = X_cat.copy()
    X_lgb["Z_cb"] = Zcb
    Zlgb = sum(m.predict(X_lgb, num_iteration=m.best_iteration) for m in lgb_models)/len(lgb_models)

    tl["iv_pred"] = tl["iv_svi"] + Zlgb

    # Where no SVI baseline exists the sum above is NaN; fall back to the mean
    # observed IV of the same snapshot instead of emitting NaN.
    no_baseline = tl["iv_pred"].isna()
    if no_baseline.any():
        snap_mean = tl.groupby(grp_cols, sort=False)["iv"].transform("mean")
        tl.loc[no_baseline, "iv_pred"] = snap_mean[no_baseline]

    # ----- pivot back -----
    # One vectorised pivot on (row id, original column name) replaces the
    # per-snapshot boolean scan of the whole frame.
    pred_wide = (tl.pivot(index="_row", columns="var", values="iv_pred")
                   .reindex(index=test_wide.index, columns=iv_cols))
    out = test_wide.copy()
    out[iv_cols] = out[iv_cols].fillna(pred_wide)   # keep observed, fill missing

    n_unfilled = int(out[iv_cols].isna().sum().sum())
    if n_unfilled:
        print(f"⚠ {n_unfilled} IV cells could not be imputed and remain NaN")

    out.to_csv(out_csv, index=False)
    print("✓ submission saved ->", out_csv)

# ===============================================================
# 7. Example usage
# ===============================================================
# predict_iv_for_test("/kaggle/input/nk-iv-prediction/test_data.parquet",
#                     "submission.csv")

## 1. Imports and Utility Functions

In [None]:
# Kaggle input paths (nk-iv-prediction dataset) read by the cells below.
TRAINING_DATA = "/kaggle/input/nk-iv-prediction/train_data.parquet"
TESTING_DATA = "/kaggle/input/nk-iv-prediction/test_data.parquet"
SAMPLE_SUBMISSION_DATA = "/kaggle/input/nk-iv-prediction/sample_submission.csv"

In [None]:
import numpy as np
import pandas as pd
import scipy.optimize as opt
from tqdm.auto import tqdm

# ─────────────────────────────────────────────────────────────────
# 1.1  Raw SVI total‐variance function and objective
# ─────────────────────────────────────────────────────────────────

def svi_total_variance(params, k_vals):
    """
    Raw SVI total variance evaluated at log-moneyness k_vals:

        w(k) = a + b * ( rho*(k - m) + sqrt((k - m)^2 + sigma^2) )

    params is unpacked as [a, b, rho, m, sigma]; the result has the shape of
    k_vals.
    """
    a, b, rho, m, sig = params
    centered = k_vals - m
    root_term = np.sqrt(centered**2 + sig**2)
    return a + b * (rho * centered + root_term)


def svi_obj_on_iv(params, k_obs, iv_obs):
    """
    Least-squares objective in IV space for one smile:
        Σ_i ( sqrt(w(k_i)) - iv_obs_i )^2
    with w(k) = svi_total_variance(params, k).
    Any non-positive w makes the objective +inf.
    """
    total_var = svi_total_variance(params, k_obs)
    # a non-positive total variance is treated as infeasible
    if np.any(total_var <= 0):
        return np.inf
    residuals = np.sqrt(total_var) - iv_obs
    return np.sum(residuals**2)


# ─────────────────────────────────────────────────────────────────
# 1.2  Per‐row SVI calibration (calls OR puts)
# ─────────────────────────────────────────────────────────────────

def calibrate_svi_for_row(strikes, ivs, underlying):
    """
    Fit SVI to a single row's observed strikes & IVs (one option side).

    Inputs:
      - strikes:    array of strike prices where the IV is known
      - ivs:        array of the corresponding observed IVs
      - underlying: scalar underlying price for that row
    Returns:
      array [a, b, rho, m, sigma] if the fit succeeds; None when fewer than
      5 points are given, the optimiser raises or reports failure, or the
      fitted parameters are unusable.
    """
    # If too few points, bail out
    if len(strikes) < 5:
        return None

    # Log-moneyness k_i = ln(K / underlying)
    k_obs = np.log(strikes / underlying)
    iv_obs = ivs.astype(float)

    # Initial guess for [a, b, rho, m, sigma]
    v_min = np.min(iv_obs**2)
    a0 = max(v_min * 0.8, 1e-6)  # 80% of the smallest squared IV, floored at 1e-6
    b0 = 0.1
    rho0 = 0.0
    m0 = 0.0                     # start centred at k = 0
    sig0 = 0.1
    x0 = np.array([a0, b0, rho0, m0, sig0], dtype=float)

    # Bounds: a>0, b>0, |rho|<=0.999, m in [min(k)-0.5, max(k)+0.5], 0<sigma<=5
    bounds = [
        (1e-8, np.inf),                          # a
        (1e-8, np.inf),                          # b
        (-0.999, 0.999),                         # rho
        (np.min(k_obs)-0.5, np.max(k_obs)+0.5),  # m
        (1e-8, 5.0)                              # sigma
    ]

    try:
        res = opt.minimize(
            fun=lambda x: svi_obj_on_iv(x, k_obs, iv_obs),
            x0=x0,
            bounds=bounds,
            method="L-BFGS-B",
            options={"maxiter":200, "ftol":1e-8}
        )
    except Exception:
        # Best-effort fit: a numerical failure means "no SVI for this row".
        # (Exception, not a bare except, so Ctrl-C still interrupts the loop.)
        return None

    if not res.success:
        return None

    fitted = res.x
    # Only a, b and sigma must be positive. rho and m are allowed to be negative
    # by the bounds above, so testing every parameter for "< 0" would throw away
    # perfectly valid fits.
    if not np.all(np.isfinite(fitted)) or np.any(fitted[[0, 1, 4]] <= 0):
        return None
    return fitted  # fitted [a, b, rho, m, sigma]


def fill_svi_smile(params, target_strikes, underlying):
    """
    Evaluate a fitted SVI smile at target_strikes.

    With params = [a, b, rho, m, sigma] the result is sqrt(max(w(k), 0)) for
    k = ln(K / underlying). With params None the result is an all-NaN array
    of len(target_strikes).
    """
    if params is not None:
        log_mny = np.log(target_strikes / underlying)
        # clamp negative variances to zero before the square root
        clamped = np.maximum(svi_total_variance(params, log_mny), 0.0)
        return np.sqrt(clamped)
    return np.full(len(target_strikes), np.nan)


## 2. Load Test Data & Sample Submission

In [None]:
# 2.1  Which columns correspond to calls and puts?
#     We will need to parse the column names like "call_iv_26000" → strike=26000.

test_df = pd.read_parquet(TESTING_DATA)
sample_sub = pd.read_csv(SAMPLE_SUBMISSION_DATA)

# IV columns, in the order they appear in the sample submission.
iv_columns = [
    c for c in sample_sub.columns
    if any(tag in c for tag in ("call_iv_", "put_iv_"))
]

# Split by option side.
call_cols = [c for c in iv_columns if c.startswith("call_iv_")]
put_cols  = [c for c in iv_columns if c.startswith("put_iv_")]

# Integer strikes parsed from the column names, ascending.
call_strikes = [int(c.replace("call_iv_","")) for c in call_cols]
call_strikes.sort()
put_strikes = [int(c.replace("put_iv_","")) for c in put_cols]
put_strikes.sort()

# Column name → strike lookups, iterated in ascending-strike order.
call_col_to_strike = dict((f"call_iv_{k}", k) for k in call_strikes)
put_col_to_strike  = dict((f"put_iv_{k}", k) for k in put_strikes)

print("Total test rows:", len(test_df))
print("Number of call strikes:", len(call_strikes), "→", call_strikes[:5], "…", call_strikes[-5:])
print("Number of put strikes: ", len(put_strikes),  "→", put_strikes[:5],  "…", put_strikes[-5:])
print(sample_sub.columns.tolist())

## 3. Row-wise Imputation Loop (calls + puts)

In [None]:
# Prepare an output array of shape (n_rows, n_iv_columns)
def _impute_side(row, col_to_strike, underlying, default_iv=0.2):
    """Return the IVs of one option side for one row, with missing cells filled.

    row           -- one test row (Series) holding the side's IV columns
    col_to_strike -- {column name: strike}; must iterate in ascending strike
                     order, because the interpolation fallback needs increasing x
    underlying    -- underlying price of the row
    default_iv    -- value used when the side has no observed IV at all

    Observed cells are returned unchanged. Missing cells come from, in order:
    the SVI fit; linear interpolation in log-moneyness (ends held flat) when the
    fit fails and at least two IVs are observed; the mean of the observed IVs;
    `default_iv`.
    """
    cols = list(col_to_strike)
    strikes = np.array([col_to_strike[c] for c in cols], dtype=float)
    ivs = row[cols].to_numpy(dtype=float)

    observed = np.isfinite(ivs)
    if observed.all():
        return ivs                       # nothing to impute

    obs_strikes, obs_ivs = strikes[observed], ivs[observed]
    miss_strikes = strikes[~observed]

    params = calibrate_svi_for_row(obs_strikes, obs_ivs, underlying)
    if params is not None:
        preds = fill_svi_smile(params, miss_strikes, underlying)
    elif len(obs_strikes) >= 2:
        preds = np.interp(np.log(miss_strikes / underlying),
                          np.log(obs_strikes / underlying),
                          obs_ivs)
    else:
        fill_value = obs_ivs.mean() if len(obs_ivs) > 0 else default_iv
        preds = np.full(len(miss_strikes), fill_value)

    filled = ivs.copy()
    filled[~observed] = preds            # the boolean mask keeps strike order aligned
    return filled


n_rows = len(test_df)
n_ivcols = len(iv_columns)

# Output frame with exactly sample_sub's columns and order.
out = pd.DataFrame(index=np.arange(n_rows), columns=sample_sub.columns, dtype=float)
out["timestamp"] = sample_sub["timestamp"].values

# Collect all filled values in one array and assign once, instead of writing
# cell by cell into the DataFrame.
side_cols = list(call_col_to_strike) + list(put_col_to_strike)
n_call = len(call_col_to_strike)
filled_ivs = np.full((n_rows, len(side_cols)), np.nan)

# Rows are addressed by position so a non-default test_df index cannot misalign
# with `out`; tqdm reports progress (no per-row print).
for pos, (_, row) in enumerate(tqdm(test_df.iterrows(), total=n_rows, desc="Imputing rows")):
    underlying = float(row["underlying"])
    filled_ivs[pos, :n_call] = _impute_side(row, call_col_to_strike, underlying)
    filled_ivs[pos, n_call:] = _impute_side(row, put_col_to_strike, underlying)

out[side_cols] = filled_ivs

## 4. Sanity‐check & Export Submission 

In [None]:
# 4.1  Quick sanity: check no NaNs remain
# Every cell must be filled before export.
assert not out.isna().to_numpy().any(), "There are still NaNs in the output!"

# Reorder columns to match sample_submission.
out = out.loc[:, sample_sub.columns]

# Write the submission file and report its shape.
out.to_csv("submission.csv", index=False)
print("Wrote submission.csv with shape", out.shape)

In [24]:
import numpy as np
import pandas as pd

# 6.1  Read the sample submission (with timestamp + IV columns) and our submission
# Load the reference layout and the file we just wrote.
sample_sub = pd.read_csv(SAMPLE_SUBMISSION_DATA)
sub        = pd.read_csv("submission.csv")

# Shape and column order must match the sample exactly.
assert sub.shape == sample_sub.shape, f"Shape mismatch: {sub.shape} vs {sample_sub.shape}"
assert sub.columns.tolist() == sample_sub.columns.tolist(), "Column order or names mismatch!"

# No missing values anywhere in the submission.
assert not sub.isna().to_numpy().any(), "Submission has NaNs!"

# IV columns are everything except 'timestamp'.
iv_columns = [c for c in sub.columns if c != "timestamp"]

# Reload the test data (row order is assumed to match the submission) and
# locate the cells that were observed there.
test_df = pd.read_parquet(TESTING_DATA)
mask_obs = test_df[iv_columns].notna()

pred_vals = sub[iv_columns].values
true_vals = test_df[iv_columns].values

# Observed cells must come through unchanged.
observed_cells = mask_obs.values
diff = pred_vals[observed_cells] - true_vals[observed_cells]
assert np.allclose(diff, 0.0, atol=1e-12), "Some observed IVs were changed!"

# RMSE restricted to the observed cells.
rmse_observed = np.sqrt(np.mean(np.square(diff)))
print(f"Observed‐cell RMSE (should be 0): {rmse_observed:.12f}")

# Filled (originally NaN) cells have no ground truth here, so no RMSE is
# computed for them; a hold-out on train would be needed for that.
mask_filled = np.logical_not(observed_cells)
filled_diff = pred_vals[mask_filled]

# Range check: every IV must lie in [0, 5].
iv_data = sub[iv_columns].values.flatten()
assert ((iv_data >= 0.0) & (iv_data <= 5.0)).all(), "Some IVs are out of a reasonable range!"

print("✔ Sanity check passed: shape, columns, no NaNs, observed IVs untouched, IV range OK.")

Observed‐cell RMSE (should be 0): 0.000000000000
✔ Sanity check passed: shape, columns, no NaNs, observed IVs untouched, IV range OK.


## Flattened long-form transformation

In [26]:
import pandas as pd

# 1. Read the original wide-format train.parquet
# Wide-format training data.
train = pd.read_parquet(TRAINING_DATA)

# IV columns, overall and per option side (in file column order).
iv_cols = [c for c in train.columns if c.startswith(("call_iv_", "put_iv_"))]
call_cols = [c for c in iv_cols if c.startswith("call_iv_")]
put_cols  = [c for c in iv_cols if c.startswith("put_iv_")]

_id_vars = ["timestamp", "underlying", "expiry"] + [f"X{i}" for i in range(42)]


def _side_to_long(value_cols, prefix, cp_flag):
    """Melt one option side of `train`: one row per (input row, IV column).

    The strike is parsed from the column name by removing `prefix`; `cp_flag`
    is stored as a constant column.
    """
    long_df = train.melt(
        id_vars=_id_vars,
        value_vars=value_cols,
        var_name="var", value_name="iv"
    )
    long_df["cp_flag"] = cp_flag
    long_df["strike"] = long_df["var"].str.replace(prefix, "").astype(int)
    return long_df.drop(columns="var")


calls_long = _side_to_long(call_cols, "call_iv_", 0)   # calls
puts_long  = _side_to_long(put_cols, "put_iv_", 1)     # puts

# Stack both sides into one long frame with a fresh index.
train_long = pd.concat([calls_long, puts_long], ignore_index=True)
train_long = train_long.reset_index(drop=True)
print("Long‐form train shape:", train_long.shape)
print(train_long.head())


Long‐form train shape: (9273680, 48)
             timestamp  underlying      expiry        X0        X1  \
0  1745296089000000000     24160.9  2025-04-24 -0.092103 -0.084458   
1  1745304077000000000     24188.1  2025-04-24 -0.013699 -0.023263   
2  1745313495000000000     24148.6  2025-04-24 -0.395427 -0.056440   
3  1745313499000000000     24147.4  2025-04-24  0.007829 -0.086614   
4  1745313608000000000     24155.9  2025-04-24  0.012404 -0.005619   

             X2        X3            X4        X5        X6  ...       X35  \
0  1.025842e+05  0.001655 -1.379624e+06  0.027959 -0.020240  ...  0.024715   
1  1.086423e+05 -0.004734  2.528508e+06 -0.006439 -0.011416  ... -0.004020   
2 -1.194717e+06  0.005011 -1.185146e+08 -0.027625 -0.002189  ... -0.035342   
3  5.936540e+05  0.001347  3.826919e+06 -0.052881 -0.015226  ... -0.045472   
4  2.419948e+05 -0.001172  4.596446e+06  0.039326  0.015934  ...  0.044814   

        X36       X37           X38           X39            X40  \
0  0.