In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import root_mean_squared_error as rmse

# Load the data
train_data = pd.read_parquet("data/train_data.parquet")
test_data = pd.read_parquet("data/test_data.parquet")

# Convert expiry to datetime if it's not already
if train_data["expiry"].dtype != "datetime64[ns]":
    train_data["expiry"] = pd.to_datetime(train_data["expiry"])

# Get the target expiry date
target_date = pd.Timestamp("2025-05-08").date()


In [3]:
data_path = "data"
output_path = "output/"
temp_path = "tmp/"

sample_submission = pd.read_csv(f"{data_path}/sample_submission.csv")
submission_cols = sample_submission.columns.tolist()

In [5]:
# Filter rows with the specified expiry date
expiry_filter = train_data['expiry'].dt.date == target_date
target_rows = train_data[expiry_filter]

# Get indices of rows to be used for validation (50% of the rows with target expiry)
validation_indices = target_rows.sample(frac=0.5, random_state=43).index

# Create validation set
val_data = train_data.loc[validation_indices].copy()

# Remove validation data from training set
train_data = train_data.drop(validation_indices)

# Print shapes to confirm
print(f"Original training data shape: {len(train_data) + len(val_data)}")
print(f"New training data shape: {train_data.shape}")
print(f"Validation data shape: {val_data.shape}")
print(f"Test data shape: {test_data.shape}")

Original training data shape: 178340
New training data shape: (138752, 97)
Validation data shape: (39588, 97)
Test data shape: (12065, 96)


In [6]:
test_data.columns

Index(['timestamp', 'underlying', 'call_iv_24000', 'call_iv_24100',
       'call_iv_24200', 'call_iv_24300', 'call_iv_24400', 'call_iv_24500',
       'call_iv_24600', 'call_iv_24700', 'call_iv_24800', 'call_iv_24900',
       'call_iv_25000', 'call_iv_25100', 'call_iv_25200', 'call_iv_25300',
       'call_iv_25400', 'call_iv_25500', 'call_iv_25600', 'call_iv_25700',
       'call_iv_25800', 'call_iv_25900', 'call_iv_26000', 'call_iv_26100',
       'call_iv_26200', 'call_iv_26300', 'call_iv_26400', 'call_iv_26500',
       'put_iv_23000', 'put_iv_23100', 'put_iv_23200', 'put_iv_23300',
       'put_iv_23400', 'put_iv_23500', 'put_iv_23600', 'put_iv_23700',
       'put_iv_23800', 'put_iv_23900', 'put_iv_24000', 'put_iv_24100',
       'put_iv_24200', 'put_iv_24300', 'put_iv_24400', 'put_iv_24500',
       'put_iv_24600', 'put_iv_24700', 'put_iv_24800', 'put_iv_24900',
       'put_iv_25000', 'put_iv_25100', 'put_iv_25200', 'put_iv_25300',
       'put_iv_25400', 'put_iv_25500', 'X0', 'X1', 'X2',

In [7]:
train_data.columns

Index(['timestamp', 'underlying', 'expiry', 'call_iv_23500', 'call_iv_23600',
       'call_iv_23700', 'call_iv_23800', 'call_iv_23900', 'call_iv_24000',
       'call_iv_24100', 'call_iv_24200', 'call_iv_24300', 'call_iv_24400',
       'call_iv_24500', 'call_iv_24600', 'call_iv_24700', 'call_iv_24800',
       'call_iv_24900', 'call_iv_25000', 'call_iv_25100', 'call_iv_25200',
       'call_iv_25300', 'call_iv_25400', 'call_iv_25500', 'call_iv_25600',
       'call_iv_25700', 'call_iv_25800', 'call_iv_25900', 'call_iv_26000',
       'put_iv_22500', 'put_iv_22600', 'put_iv_22700', 'put_iv_22800',
       'put_iv_22900', 'put_iv_23000', 'put_iv_23100', 'put_iv_23200',
       'put_iv_23300', 'put_iv_23400', 'put_iv_23500', 'put_iv_23600',
       'put_iv_23700', 'put_iv_23800', 'put_iv_23900', 'put_iv_24000',
       'put_iv_24100', 'put_iv_24200', 'put_iv_24300', 'put_iv_24400',
       'put_iv_24500', 'put_iv_24600', 'put_iv_24700', 'put_iv_24800',
       'put_iv_24900', 'put_iv_25000', 'X0', '

In [8]:
pred_cols = list(
    filter(
        lambda x: x.startswith("call") or x.startswith("put"), val_data.columns.tolist()
    )
)
len(pred_cols)

52

## Setting 32 columns to nan randomly in each row

In [9]:
# Create a copy of val_data to avoid modifying the original
sample_val = val_data.copy()

# Get the call and put columns separately
call_cols = [col for col in pred_cols if col.startswith('call')]
put_cols = [col for col in pred_cols if col.startswith('put')]

# For each row, randomly select 16 call columns and 16 put columns to set to NaN
for idx in sample_val.index:
  # Randomly select 16 call columns for this row
  nan_call_cols = np.random.choice(call_cols, size=16, replace=False)
  
  # Randomly select 16 put columns for this row
  nan_put_cols = np.random.choice(put_cols, size=16, replace=False)
  
  # Set these columns to NaN for this row
  sample_val.loc[idx, nan_call_cols] = np.nan
  sample_val.loc[idx, nan_put_cols] = np.nan

# Create val_Y from the original validation data, containing only the target columns
val_Y = val_data[pred_cols]

In [10]:
sample_val

Unnamed: 0,timestamp,underlying,expiry,call_iv_23500,call_iv_23600,call_iv_23700,call_iv_23800,call_iv_23900,call_iv_24000,call_iv_24100,...,X32,X33,X34,X35,X36,X37,X38,X39,X40,X41
158881,1746590742000000000,24433.0,2025-05-08,,,0.383775,,,,,...,-0.022656,0.006262,2.553592e+06,-0.001931,2.372125,0.068108,-1.522534e+06,-3.048516e+06,1.254699e+06,-0.000000e+00
117518,1746176782000000000,24285.5,2025-05-08,,,0.187751,0.178467,,,0.154975,...,0.003836,-0.004734,6.693594e+05,0.010055,1.315715,0.030237,5.235704e+04,-8.124241e+06,1.313372e+06,-1.000000e-06
155403,1746520665000000000,24387.5,2025-05-08,0.288416,,0.245484,,0.199588,,,...,0.001037,0.003309,2.847738e+06,0.008796,0.648142,0.041674,3.459928e+05,-2.764208e+07,2.203250e+06,0.000000e+00
159498,1746591359000000000,24384.6,2025-05-08,,,,,,,,...,0.035097,-0.030470,-3.629051e+06,0.146150,-8.941554,1.278703,6.211531e+06,7.623811e+06,-5.587995e+05,3.208558e+06
106693,1746165957000000000,24329.3,2025-05-08,,,,,,0.173841,0.166765,...,0.001900,0.002318,-1.051215e+06,0.026776,0.242156,0.025405,2.716713e+06,-3.698245e+06,-0.000000e+00,-0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150469,1746515731000000000,24393.6,2025-05-08,0.287421,0.266833,,,0.199508,,,...,-0.008578,-0.004349,7.608529e+05,0.022286,0.276633,-0.049681,-3.766206e+05,3.310812e+06,0.000000e+00,0.000000e+00
178012,1746609873000000000,24362.1,2025-05-08,,,,,0.293386,0.255666,,...,0.006081,-0.001241,-5.956005e+06,-0.053984,-2.637237,0.118681,-7.248843e+05,-4.210857e+06,-3.790669e+06,0.000000e+00
114201,1746173465000000000,24415.4,2025-05-08,,0.207705,,0.188346,,,,...,0.016044,0.013809,-1.940570e+05,-0.009913,1.074973,0.016054,-3.876233e+05,4.172262e+06,0.000000e+00,1.000000e-06
133136,1746431799000000000,24450.5,2025-05-08,0.255138,0.239076,0.223819,,0.190411,0.172838,,...,-0.008472,0.003098,3.785463e+05,0.001317,0.374664,0.009704,-6.879767e+06,-6.470817e+06,0.000000e+00,0.000000e+00


## Cosh Regresssion

In [11]:
from scipy.optimize import curve_fit


    # Create a function to fit cosh regression and predict missing values
def fit_cosh_curve(row, call_cols, put_cols):
        # Separate call and put columns
        call_values = row[call_cols].values.astype(np.float64)
        put_values = row[put_cols].values.astype(np.float64)

        # Get the indices of non-NaN values
        call_non_nan_indices = np.where(~np.isnan(call_values))[0]
        put_non_nan_indices = np.where(~np.isnan(put_values))[0]

        # Create strike arrays based on column names
        call_strikes = np.array([int(col.split("_")[-1]) for col in call_cols])
        put_strikes = np.array([int(col.split("_")[-1]) for col in put_cols])

        # Function to fit: a * cosh((x - b) / c) + d
        def cosh_func(x, a, b, c, d):
            return a * np.cosh((x - b) / c) + d
        # def cosh_func(x, a, b, c):
        #     return a * np.cosh(((b*x) + c))
        # Initialize predictions with NaN
        call_preds = np.full(len(call_cols), np.nan)
        put_preds = np.full(len(put_cols), np.nan)

        # Fit cosh function to non-NaN call values if we have enough data points
        if len(call_non_nan_indices) >= 4:  # Need at least 4 points to fit 4 parameters
            try:
                # Initial parameter guesses
                p0 = [0.1, call_strikes.mean(), 1000,0.1]

                # Fit the function
                popt, _ = curve_fit(
                    cosh_func,
                    call_strikes[call_non_nan_indices],
                    call_values[call_non_nan_indices],
                    p0=p0,
                    maxfev=10000,
                )

                # Predict all values using the fitted function
                call_preds = cosh_func(call_strikes, *popt)
            except:
                # If curve_fit fails, use the mean of non-NaN values
                call_preds = np.full(len(call_cols), np.nanmean(call_values))

        # Fit cosh function to non-NaN put values if we have enough data points
        if len(put_non_nan_indices) >= 4:
            try:
                # Initial parameter guesses
                p0 = [0.1, put_strikes.mean(), 1000,0.1 ]

                # Fit the function
                popt, _ = curve_fit(
                    cosh_func,
                    put_strikes[put_non_nan_indices],
                    put_values[put_non_nan_indices],
                    p0=p0,
                    maxfev=10000,
                )

                # Predict all values using the fitted function
                put_preds = cosh_func(put_strikes, *popt)
            except:
                # If curve_fit fails, use the mean of non-NaN values
                put_preds = np.full(len(put_cols), np.nanmean(put_values))

        # Combine predictions
        all_preds = np.concatenate([call_preds, put_preds])

        return all_preds


# Import necessary function

# Get the call and put columns separately
call_cols = [col for col in pred_cols if col.startswith("call")]
put_cols = [col for col in pred_cols if col.startswith("put")]

# Create a DataFrame to store the predictions
preds = pd.DataFrame(index=sample_val.index, columns=pred_cols)

# Apply the cosh regression function to each row
for idx in sample_val.index:
    row = sample_val.loc[idx]

    # Get predictions for this row
    row_preds = fit_cosh_curve(row, call_cols, put_cols)

    # Store predictions in the DataFrame
    preds.loc[idx] = row_preds

# Calculate RMSE
rmse_value = rmse(val_Y, preds)
print(f"RMSE using Cosh Regression: {rmse_value}")

    #

  return a * np.cosh((x - b) / c) + d
  return a * np.cosh((x - b) / c) + d
  return a * np.cosh((x - b) / c) + d
  return a * np.cosh((x - b) / c) + d
  return a * np.cosh((x - b) / c) + d
  return a * np.cosh((x - b) / c) + d
  return a * np.cosh((x - b) / c) + d
  return a * np.cosh((x - b) / c) + d
  return a * np.cosh((x - b) / c) + d
  return a * np.cosh((x - b) / c) + d
  return a * np.cosh((x - b) / c) + d
  return a * np.cosh((x - b) / c) + d
  return a * np.cosh((x - b) / c) + d
  return a * np.cosh((x - b) / c) + d
  return a * np.cosh((x - b) / c) + d
  return a * np.cosh((x - b) / c) + d
  return a * np.cosh((x - b) / c) + d
  return a * np.cosh((x - b) / c) + d
  return a * np.cosh((x - b) / c) + d
  return a * np.cosh((x - b) / c) + d
  return a * np.cosh((x - b) / c) + d
  return a * np.cosh((x - b) / c) + d
  return a * np.cosh((x - b) / c) + d
  return a * np.cosh((x - b) / c) + d
  return a * np.cosh((x - b) / c) + d
  return a * np.cosh((x - b) / c) + d
  return a *

RMSE using Cosh Regression: 0.028130095739386827


In [4]:
preds

NameError: name 'preds' is not defined

## Predicting on test data

In [5]:
test_pred_cols = list(
    filter(
        lambda x: x.startswith("call") or x.startswith("put"), test_data.columns.tolist()
    )
)
len(test_pred_cols)

52

In [8]:


# Create a function to fit cosh regression and predict missing values
def fit_cosh_curve(row, call_cols, put_cols):
    # Separate call and put columns
    call_values = row[call_cols].values.astype(np.float64)
    put_values = row[put_cols].values.astype(np.float64)

    # Get the indices of non-NaN values
    call_non_nan_indices = np.where(~np.isnan(call_values))[0]
    put_non_nan_indices = np.where(~np.isnan(put_values))[0]

    # Create strike arrays based on column names
    call_strikes = np.array([int(col.split("_")[-1]) for col in call_cols])
    put_strikes = np.array([int(col.split("_")[-1]) for col in put_cols])

    # Function to fit: a * cosh((x - b) / c) + d
    def cosh_func(x, a, b, c, d):
        return a * np.cosh((x - b) / c) + d

    # def cosh_func(x, a, b, c):
    #     return a * np.cosh(((b*x) + c))
    # Initialize predictions with NaN
    call_preds = np.full(len(call_cols), np.nan)
    put_preds = np.full(len(put_cols), np.nan)

    # Fit cosh function to non-NaN call values if we have enough data points
    if len(call_non_nan_indices) >= 4:  # Need at least 4 points to fit 4 parameters
        try:
            # Initial parameter guesses
            p0 = [0.1, call_strikes.mean(), 0.1, 1000]

            # Fit the function
            popt, _ = curve_fit(
                cosh_func,
                call_strikes[call_non_nan_indices],
                call_values[call_non_nan_indices],
                p0=p0,
                maxfev=10000,
            )

            # Predict all values using the fitted function
            call_preds = cosh_func(call_strikes, *popt)
        except:
            # If curve_fit fails, use the mean of non-NaN values
            call_preds = np.full(len(call_cols), np.nanmedian(call_values))
    else:
        call_preds = np.full(len(call_cols), np.nanmedian(call_values))
    # Fit cosh function to non-NaN put values if we have enough data points
    if len(put_non_nan_indices) >= 4:
        try:
            # Initial parameter guesses
            p0 = [0.1, put_strikes.mean(), 0.1, 1000]

            # Fit the function
            popt, _ = curve_fit(
                cosh_func,
                put_strikes[put_non_nan_indices],
                put_values[put_non_nan_indices],
                p0=p0,
                maxfev=10000,
            )

            # Predict all values using the fitted function
            put_preds = cosh_func(put_strikes, *popt)
        except:
            # If curve_fit fails, use the mean of non-NaN values
            put_preds = np.full(len(put_cols), np.nanmedian(put_values))
    else:
        put_preds = np.full(len(put_cols), np.nanmedian(put_values))

    # Combine predictions
    all_preds = np.concatenate([call_preds, put_preds])

    return all_preds


# Import necessary function

# Get the call and put columns separately
call_cols = [col for col in test_pred_cols if col.startswith("call")]
put_cols = [col for col in test_pred_cols if col.startswith("put")]

# Create a DataFrame to store the predictions
preds = pd.DataFrame(index=test_data.index, columns=test_pred_cols)

# Apply the cosh regression function to each row
for idx in test_data.index:
    row = test_data.loc[idx]

    # Get predictions for this row
    row_preds = fit_cosh_curve(row, call_cols, put_cols)

    # Store predictions in the DataFrame
    preds.loc[idx] = row_preds

# Calculate RMSE
# rmse_value = rmse(val_Y, preds)
# print(f"RMSE using Cosh Regression: {rmse_value}")

#
test_data[preds.columns] = preds

  r, k = function_base._ureduce(a, func=_nanmedian, axis=axis, out=out,
  r, k = function_base._ureduce(a, func=_nanmedian, axis=axis, out=out,
  r, k = function_base._ureduce(a, func=_nanmedian, axis=axis, out=out,
  r, k = function_base._ureduce(a, func=_nanmedian, axis=axis, out=out,
  r, k = function_base._ureduce(a, func=_nanmedian, axis=axis, out=out,
  r, k = function_base._ureduce(a, func=_nanmedian, axis=axis, out=out,
  r, k = function_base._ureduce(a, func=_nanmedian, axis=axis, out=out,
  r, k = function_base._ureduce(a, func=_nanmedian, axis=axis, out=out,
  r, k = function_base._ureduce(a, func=_nanmedian, axis=axis, out=out,
  r, k = function_base._ureduce(a, func=_nanmedian, axis=axis, out=out,
  r, k = function_base._ureduce(a, func=_nanmedian, axis=axis, out=out,
  r, k = function_base._ureduce(a, func=_nanmedian, axis=axis, out=out,
  r, k = function_base._ureduce(a, func=_nanmedian, axis=axis, out=out,
  r, k = function_base._ureduce(a, func=_nanmedian, axis=axis, o

In [9]:
print(test_data.isna().sum().sum())
cosh_submission = test_data[submission_cols]
cosh_submission.to_csv(f"{output_path}/cosh_submission.csv", index=False)
cosh_submission

962


Unnamed: 0,timestamp,call_iv_24000,call_iv_24100,call_iv_24200,call_iv_24300,call_iv_24400,call_iv_24500,call_iv_24600,call_iv_24700,call_iv_24800,...,put_iv_24600,put_iv_24700,put_iv_24800,put_iv_24900,put_iv_25000,put_iv_25100,put_iv_25200,put_iv_25300,put_iv_25400,put_iv_25500
0,0,0.254209,0.254209,0.254209,0.254209,0.254209,0.254209,0.254209,0.254209,0.254209,...,0.242295,0.242295,0.242295,0.242295,0.242295,0.242295,0.242295,0.242295,0.242295,0.242295
1,1,0.258893,0.258893,0.258893,0.258893,0.258893,0.258893,0.258893,0.258893,0.258893,...,0.354323,0.354323,0.354323,0.354323,0.354323,0.354323,0.354323,0.354323,0.354323,0.354323
2,2,0.20458,0.20458,0.20458,0.20458,0.20458,0.20458,0.20458,0.20458,0.20458,...,0.222443,0.222443,0.222443,0.222443,0.222443,0.222443,0.222443,0.222443,0.222443,0.222443
3,3,0.230049,0.230049,0.230049,0.230049,0.230049,0.230049,0.230049,0.230049,0.230049,...,0.210733,0.210733,0.210733,0.210733,0.210733,0.210733,0.210733,0.210733,0.210733,0.210733
4,4,0.193893,0.193893,0.193893,0.193893,0.193893,0.193893,0.193893,0.193893,0.193893,...,0.2116,0.2116,0.2116,0.2116,0.2116,0.2116,0.2116,0.2116,0.2116,0.2116
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12060,12060,0.186856,0.186856,0.186856,0.186856,0.186856,0.186856,0.186856,0.186856,0.186856,...,0.254285,0.254285,0.254285,0.254285,0.254285,0.254285,0.254285,0.254285,0.254285,0.254285
12061,12061,0.202897,0.202897,0.202897,0.202897,0.202897,0.202897,0.202897,0.202897,0.202897,...,0.179082,0.179082,0.179082,0.179082,0.179082,0.179082,0.179082,0.179082,0.179082,0.179082
12062,12062,0.227768,0.227768,0.227768,0.227768,0.227768,0.227768,0.227768,0.227768,0.227768,...,0.205947,0.205947,0.205947,0.205947,0.205947,0.205947,0.205947,0.205947,0.205947,0.205947
12063,12063,0.184492,0.184492,0.184492,0.184492,0.184492,0.184492,0.184492,0.184492,0.184492,...,0.255995,0.255995,0.255995,0.255995,0.255995,0.255995,0.255995,0.255995,0.255995,0.255995
