In [106]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import root_mean_squared_error as rmse
from alive_progress import alive_bar
import time
import os

# Load the data
train_data = pd.read_csv("../tmp/filtered_data.csv")
# test_data = pd.read_parquet("data/test_data.parquet")
test_data = pd.read_parquet("../data/test_data.parquet")

# Convert expiry to datetime if it's not already
if train_data["expiry"].dtype != "datetime64[ns]":
    train_data["expiry"] = pd.to_datetime(train_data["expiry"])

# Get the target expiry date
target_date = pd.Timestamp("2025-05-08").date()

In [107]:
train_data.isna().sum().sum()

315

In [108]:
# Impute median for NaN values in train_data
for col in train_data.columns:
    if train_data[col].dtype in ["float64", "int64"] and train_data[col].isna().any():
        median_val = train_data[col].median()
        train_data[col].fillna(median_val, inplace=True)
        print(
            f"Imputed {train_data[col].isna().sum()} NaN values in {col} with median: {median_val}"
        )

print(f"Remaining NaN values in train_data: {train_data.isna().sum().sum()}")

Imputed 0 NaN values in call_iv_23500 with median: 0.297625
Imputed 0 NaN values in call_iv_23600 with median: 0.277056
Imputed 0 NaN values in call_iv_23700 with median: 0.256366
Imputed 0 NaN values in call_iv_23800 with median: 0.235445
Imputed 0 NaN values in call_iv_23900 with median: 0.216572
Imputed 0 NaN values in call_iv_24000 with median: 0.196701
Imputed 0 NaN values in call_iv_26000 with median: 0.339635
Imputed 0 NaN values in put_iv_24300 with median: 0.14258349999999997
Imputed 0 NaN values in put_iv_24600 with median: 0.151955
Imputed 0 NaN values in put_iv_24800 with median: 0.17717650000000001
Imputed 0 NaN values in put_iv_24900 with median: 0.1833235
Imputed 0 NaN values in put_iv_25000 with median: 0.187438
Remaining NaN values in train_data: 0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting va

In [109]:
train_data.isna().sum().sum()

0

In [110]:
# # Filter rows with the specified expiry date
# expiry_filter = train_data["expiry"].dt.date == target_date
# target_rows = train_data[expiry_filter]

# # Get indices of rows to be used for validation (50% of the rows with target expiry)
# validation_indices = target_rows.sample(frac=0.3, random_state=43).index

# # Create validation set
# val_data = train_data.loc[validation_indices].copy()

# # Remove validation data from training set
# train_data = train_data.drop(validation_indices)

# # Print shapes to confirm
# print(f"Original training data shape: {len(train_data) + len(val_data)}")
# print(f"New training data shape: {train_data.shape}")
# print(f"Validation data shape: {val_data.shape}")
# print(f"Test data shape: {test_data.shape}")

In [111]:
# val_Y = pd.read_parquet("val_Y.parquet")
# sample_val = pd.read_parquet("sample_val.parquet")
# sample_val_matching = pd.read_csv("output/matching.csv")

In [112]:
# import xgboost as xgb
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_squared_error

# def get_prediction(target_col):
#     # Prepare features and target
#     feature_cols = ['underlying'] + [col for col in train_data.columns if col.startswith('X')]
#     X = train_data[feature_cols]
#     y = train_data[target_col]

#     # Split into train and validation
#     X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

#     # Train XGBoost model
#     xgb_model = xgb.XGBRegressor(
#     n_estimators=100,
#     max_depth=6,
#     learning_rate=0.1,
#     random_state=42
#   )

#     xgb_model.fit(X_train, y_train)

#     # Make predictions
#     y_pred = xgb_model.predict(X_val)

#     # Calculate RMSE
#     rmse_score = rmse(y_val, y_pred)
#     print(f"RMSE for {target_col}: {rmse_score:.6f}")

#     # Predict on validation data
#     val_features = val_data[feature_cols]
#     val_predictions = xgb_model.predict(val_features)

#     print(f"Predictions shape: {val_predictions.shape}")
#     print(f"Sample predictions: {val_predictions[:5]}")


In [113]:
# get_prediction("call_iv_25000")

In [114]:
# # Convert expiry dates to integer labels
# unique_expiries = sorted(train_data['expiry'].unique())
# expiry_mapping = {expiry: i for i, expiry in enumerate(unique_expiries)}

# # Apply mapping to train_data
# train_data['expiry'] = train_data['expiry'].map(expiry_mapping)

# # Apply same mapping to val_data
# val_data['expiry'] = val_data['expiry'].map(expiry_mapping)

# print(f"Expiry mapping: {expiry_mapping}")
# print(f"Train data expiry values: {sorted(train_data['expiry'].unique())}")
# print(f"Val data expiry values: {sorted(val_data['expiry'].unique())}")

In [115]:
# sample_val.expiry = 2
# val_Y.expiry = 2

In [116]:
# sample_val

In [117]:
common_cols = [col for col in train_data.columns if col.startswith('call') and col in test_data.columns]
common_cols

['call_iv_24000',
 'call_iv_24100',
 'call_iv_24200',
 'call_iv_24300',
 'call_iv_24400',
 'call_iv_24500',
 'call_iv_24600',
 'call_iv_24700',
 'call_iv_24800',
 'call_iv_24900',
 'call_iv_25000',
 'call_iv_25100',
 'call_iv_25200',
 'call_iv_25300',
 'call_iv_25400',
 'call_iv_25500',
 'call_iv_25600',
 'call_iv_25700',
 'call_iv_25800',
 'call_iv_25900',
 'call_iv_26000']

In [118]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import root_mean_squared_error as rmse


def train_xgb(given_df, test_df, col1, col2):
    # -----------------------------
    # Step 1: Setup
    # -----------------------------
    # Define your target
    target_col = col2

    # Add additional features
    feature_cols = [col1 ]+ ["underlying"]  # Add more if available

    # -----------------------------
    # Step 2: Prepare Training Data
    # -----------------------------
    # Drop rows with NaN in any input or the target
    given_df = given_df[feature_cols + [target_col]].dropna()

    X = given_df[feature_cols]
    y = given_df[target_col]

    # Optional: Normalize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # -----------------------------
    # Step 3: Train XGBoost Regressor
    # -----------------------------
    xgb_model = XGBRegressor(
        n_estimators=800,
        learning_rate=0.1,
        subsample=0.9,
        random_state=43,
    )

    xgb_model.fit(X_scaled, y)

    # -----------------------------
    # Step 4: Predict in Test Data
    # -----------------------------
    # Test: Make sure underlying and IVs (except target) are available
    test_data = test_df[feature_cols].copy()
    test_data_scaled = scaler.transform(test_data)

    # Predict target column
    predicted_call_iv_25000 = xgb_model.predict(test_data_scaled)

    # -----------------------------
    # Step 5: Impute into Test DataFrame
    # -----------------------------
    test_imputed = test_df.copy()
    missing_mask = test_imputed[target_col].isna()
    test_imputed.loc[missing_mask, target_col] = predicted_call_iv_25000[missing_mask]
    return test_imputed

In [119]:
pred_col_fin = [
    "call_iv_24000",
    "call_iv_24700",
    "call_iv_24800",
    "call_iv_25000",
    "call_iv_25200",
    "call_iv_25300",
    "call_iv_25400",
    "call_iv_25500",
    "call_iv_25600",
    "call_iv_25700",
    "call_iv_25800",
    "call_iv_25900",
    "call_iv_26000",
]

In [120]:
test_data.shape

(12065, 96)

In [121]:
test_data.isna().sum().sum()

376504

In [122]:
for i in range(common_cols.__len__()-1):
    if(common_cols[i+1] not in pred_col_fin):
        continue
    print("Predicting for:", common_cols[i], "->", common_cols[i + 1])
    test_data[~test_data[common_cols[i]].isna()] = train_xgb(
        train_data,
        test_data[~test_data[common_cols[i]].isna()],
        common_cols[i],
        common_cols[i + 1],
    )

Predicting for: call_iv_24600 -> call_iv_24700
Predicting for: call_iv_24700 -> call_iv_24800
Predicting for: call_iv_24900 -> call_iv_25000
Predicting for: call_iv_25100 -> call_iv_25200
Predicting for: call_iv_25200 -> call_iv_25300
Predicting for: call_iv_25300 -> call_iv_25400
Predicting for: call_iv_25400 -> call_iv_25500
Predicting for: call_iv_25500 -> call_iv_25600
Predicting for: call_iv_25600 -> call_iv_25700
Predicting for: call_iv_25700 -> call_iv_25800
Predicting for: call_iv_25800 -> call_iv_25900
Predicting for: call_iv_25900 -> call_iv_26000


In [123]:
test_data.isna().sum().sum()

311488

In [124]:
common_cols2 = [
    col
    for col in train_data.columns
    if col.startswith("put") and col in test_data.columns
]
common_cols2

['put_iv_23000',
 'put_iv_23100',
 'put_iv_23200',
 'put_iv_23300',
 'put_iv_23400',
 'put_iv_23500',
 'put_iv_23600',
 'put_iv_23700',
 'put_iv_23800',
 'put_iv_23900',
 'put_iv_24000',
 'put_iv_24100',
 'put_iv_24200',
 'put_iv_24300',
 'put_iv_24400',
 'put_iv_24500',
 'put_iv_24600',
 'put_iv_24700',
 'put_iv_24800',
 'put_iv_24900',
 'put_iv_25000']

In [125]:
pred_col_fin2 = ['put_iv_23000',
 'put_iv_23100',
 'put_iv_23200',
 'put_iv_23300',
 'put_iv_23400',
 'put_iv_23500',
 'put_iv_23600',
 'put_iv_23700',
 'put_iv_23800',
 'put_iv_23900',
 'put_iv_24000',]

In [126]:
for i in range(common_cols2.__len__() - 1):
    if common_cols2[i + 1] not in pred_col_fin2:
        continue
    print("Predicting for:", common_cols2[i], "->", common_cols2[i + 1])
    test_data[~test_data[common_cols2[i]].isna()] = train_xgb(
        train_data,
        test_data[~test_data[common_cols2[i]].isna()],
        common_cols2[i],
        common_cols2[i + 1],
    )

Predicting for: put_iv_23000 -> put_iv_23100
Predicting for: put_iv_23100 -> put_iv_23200
Predicting for: put_iv_23200 -> put_iv_23300
Predicting for: put_iv_23300 -> put_iv_23400
Predicting for: put_iv_23400 -> put_iv_23500
Predicting for: put_iv_23500 -> put_iv_23600
Predicting for: put_iv_23600 -> put_iv_23700
Predicting for: put_iv_23700 -> put_iv_23800
Predicting for: put_iv_23800 -> put_iv_23900
Predicting for: put_iv_23900 -> put_iv_24000


In [127]:
test_data.isna().sum().sum()

250059

In [128]:
test_data.to_csv("finally_trained_test.csv", index=False)

In [129]:
test_data[[col for col in test_data.columns if col.startswith("call") or col.startswith("put")]].to_csv(
    "trained_matching_input.csv", index=False, header=False);