In [217]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import root_mean_squared_error as rmse
from alive_progress import alive_bar
import time
import os

# Load the data
train_data = pd.read_csv("../tmp/filtered_data.csv")
# test_data = pd.read_parquet("data/test_data.parquet")
test_data = pd.read_parquet("val_Y.parquet")

# Convert expiry to datetime if it's not already
if train_data["expiry"].dtype != "datetime64[ns]":
    train_data["expiry"] = pd.to_datetime(train_data["expiry"])

# Get the target expiry date
target_date = pd.Timestamp("2025-05-08").date()

In [218]:
# Filter rows with the specified expiry date
expiry_filter = train_data["expiry"].dt.date == target_date
target_rows = train_data[expiry_filter]

# Get indices of rows to be used for validation (50% of the rows with target expiry)
validation_indices = target_rows.sample(frac=0.3, random_state=43).index

# Create validation set
val_data = train_data.loc[validation_indices].copy()

# Remove validation data from training set
train_data = train_data.drop(validation_indices)

# Print shapes to confirm
print(f"Original training data shape: {len(train_data) + len(val_data)}")
print(f"New training data shape: {train_data.shape}")
print(f"Validation data shape: {val_data.shape}")
print(f"Test data shape: {test_data.shape}")

Original training data shape: 178340
New training data shape: (154587, 97)
Validation data shape: (23753, 97)
Test data shape: (23753, 97)


In [219]:
val_Y = pd.read_parquet("val_Y.parquet")
sample_val = pd.read_parquet("sample_val.parquet")
sample_val_matching = pd.read_csv("output/matching.csv")

In [220]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

def get_prediction(target_col):
    # Prepare features and target
    feature_cols = ['underlying'] + [col for col in train_data.columns if col.startswith('X')]
    X = train_data[feature_cols]
    y = train_data[target_col]

    # Split into train and validation
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train XGBoost model
    xgb_model = xgb.XGBRegressor(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    random_state=42
  )

    xgb_model.fit(X_train, y_train)

    # Make predictions
    y_pred = xgb_model.predict(X_val)

    # Calculate RMSE
    rmse_score = rmse(y_val, y_pred)
    print(f"RMSE for {target_col}: {rmse_score:.6f}")

    # Predict on validation data
    val_features = val_data[feature_cols]
    val_predictions = xgb_model.predict(val_features)

    print(f"Predictions shape: {val_predictions.shape}")
    print(f"Sample predictions: {val_predictions[:5]}")


In [221]:
get_prediction("call_iv_25000")

RMSE for call_iv_25000: 0.096926
Predictions shape: (23753,)
Sample predictions: [0.17303367 0.14510675 0.18535015 0.22857085 0.23733236]


In [222]:
# Convert expiry dates to integer labels
unique_expiries = sorted(train_data['expiry'].unique())
expiry_mapping = {expiry: i for i, expiry in enumerate(unique_expiries)}

# Apply mapping to train_data
train_data['expiry'] = train_data['expiry'].map(expiry_mapping)

# Apply same mapping to val_data
val_data['expiry'] = val_data['expiry'].map(expiry_mapping)

print(f"Expiry mapping: {expiry_mapping}")
print(f"Train data expiry values: {sorted(train_data['expiry'].unique())}")
print(f"Val data expiry values: {sorted(val_data['expiry'].unique())}")

Expiry mapping: {Timestamp('2025-04-24 00:00:00'): 0, Timestamp('2025-04-30 00:00:00'): 1, Timestamp('2025-05-08 00:00:00'): 2}
Train data expiry values: [0, 1, 2]
Val data expiry values: [2]


In [223]:
sample_val.expiry = 2
val_Y.expiry = 2

In [224]:
sample_val

Unnamed: 0,timestamp,underlying,expiry,call_iv_23500,call_iv_23600,call_iv_23700,call_iv_23800,call_iv_23900,call_iv_24000,call_iv_24100,...,X32,X33,X34,X35,X36,X37,X38,X39,X40,X41
158881,1746590742000000000,24433.0,2,0.435793,0.409610,0.383775,0.356050,,,,...,-0.022656,0.006262,2.553592e+06,-0.001931,2.372125,0.068108,-1.522534e+06,-3.048516e+06,1.254699e+06,-0.000000e+00
117518,1746176782000000000,24285.5,2,,,0.187751,0.178467,0.170073,0.161772,0.154975,...,0.003836,-0.004734,6.693594e+05,0.010055,1.315715,0.030237,5.235704e+04,-8.124241e+06,1.313372e+06,-1.000000e-06
155403,1746520665000000000,24387.5,2,,0.270770,,,,0.173109,,...,0.001037,0.003309,2.847738e+06,0.008796,0.648142,0.041674,3.459928e+05,-2.764208e+07,2.203250e+06,0.000000e+00
159498,1746591359000000000,24384.6,2,0.426385,0.396069,,0.341066,,,0.242813,...,0.035097,-0.030470,-3.629051e+06,0.146150,-8.941554,1.278703,6.211531e+06,7.623811e+06,-5.587995e+05,3.208558e+06
106693,1746165957000000000,24329.3,2,,,0.196625,,,,,...,0.001900,0.002318,-1.051215e+06,0.026776,0.242156,0.025405,2.716713e+06,-3.698245e+06,-0.000000e+00,-0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159879,1746591740000000000,24335.0,2,,0.377406,0.349971,,,0.256396,,...,0.008603,-0.004291,-1.986960e+06,0.012081,-1.567061,0.025376,-1.849046e+05,9.476678e+05,-8.574375e+05,-0.000000e+00
152103,1746517365000000000,24379.3,2,0.281599,0.263384,0.241601,,,,0.149054,...,-0.009330,0.001730,3.085312e+05,-0.010100,0.017339,0.041731,-5.464379e+06,9.938785e+06,0.000000e+00,0.000000e+00
145579,1746510841000000000,24383.8,2,0.277396,0.256449,,0.214142,0.191823,0.168970,0.146202,...,-0.006017,-0.014316,-5.171651e+05,0.009618,-0.905712,-0.035879,-5.117804e+05,-3.953448e+07,1.914297e+05,-0.000000e+00
171749,1746603610000000000,24348.2,2,0.386717,0.355572,0.323200,0.296829,,,0.190997,...,0.001378,0.003366,1.357577e+06,0.016415,0.766966,-0.016124,-9.293144e+05,6.414991e+06,3.323359e+04,0.000000e+00


In [225]:
common_cols = [col for col in train_data.columns if col.startswith('call') and col in test_data.columns]
common_cols

['call_iv_23500',
 'call_iv_23600',
 'call_iv_23700',
 'call_iv_23800',
 'call_iv_23900',
 'call_iv_24000',
 'call_iv_24100',
 'call_iv_24200',
 'call_iv_24300',
 'call_iv_24400',
 'call_iv_24500',
 'call_iv_24600',
 'call_iv_24700',
 'call_iv_24800',
 'call_iv_24900',
 'call_iv_25000',
 'call_iv_25100',
 'call_iv_25200',
 'call_iv_25300',
 'call_iv_25400',
 'call_iv_25500',
 'call_iv_25600',
 'call_iv_25700',
 'call_iv_25800',
 'call_iv_25900',
 'call_iv_26000']

In [226]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import root_mean_squared_error as rmse


def train_xgb(given_df, col1, col2):
    # -----------------------------
    # Step 1: Setup
    # -----------------------------
    # Define your target
    target_col = col2

    # Add additional features
    feature_cols = [col1 ]+ ["underlying", "expiry"]  # Add more if available

    # -----------------------------
    # Step 2: Prepare Training Data
    # -----------------------------
    # Drop rows with NaN in any input or the target
    given_df = given_df[feature_cols + [target_col]].dropna()

    X = given_df[feature_cols]
    y = given_df[target_col]

    # Optional: Normalize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # -----------------------------
    # Step 3: Train XGBoost Regressor
    # -----------------------------
    xgb_model = XGBRegressor(
        n_estimators=800,
        learning_rate=0.1,
        subsample=0.9,
        random_state=43,
    )

    xgb_model.fit(X_scaled, y)

    # -----------------------------
    # Step 4: Predict in Test Data
    # -----------------------------
    # Test: Make sure underlying and IVs (except target) are available
    test_data = val_Y[feature_cols].copy()
    test_data_scaled = scaler.transform(test_data)

    # Predict target column
    predicted_call_iv_25000 = xgb_model.predict(test_data_scaled)

    # -----------------------------
    # Step 5: Impute into Test DataFrame
    # -----------------------------
    test_imputed = sample_val.copy()
    missing_mask = test_imputed[target_col].isna()
    test_imputed.loc[missing_mask, target_col] = predicted_call_iv_25000[missing_mask]
    print(col2, rmse(val_Y[target_col], test_imputed[target_col]))

In [227]:
for i in range(common_cols.__len__()-1):
    train_xgb(train_data, common_cols[i], common_cols[i+1])

call_iv_23600 0.0019634675273223702
call_iv_23700 0.001617209676355377
call_iv_23800 0.0011148703205429453
call_iv_23900 0.0008918749699855635
call_iv_24000 0.0007911677166453678
call_iv_24100 0.0015873125821282662
call_iv_24200 0.0011683634481923371
call_iv_24300 0.002064869919112093
call_iv_24400 0.001039063169097482
call_iv_24500 0.0013106320889633184
call_iv_24600 0.0018321202596570658
call_iv_24700 0.0009481653714509304
call_iv_24800 0.0007197158958654638
call_iv_24900 0.0014947660925306674
call_iv_25000 0.0008678682288392795
call_iv_25100 0.0006459108472106488
call_iv_25200 0.0004970628666050636
call_iv_25300 0.00043788584321932824
call_iv_25400 0.0005733690518143546
call_iv_25500 0.0006475355415838873
call_iv_25600 0.0006518079474414109
call_iv_25700 0.0005980948522618613
call_iv_25800 0.0005668770017225137
call_iv_25900 0.0006245723634674341
call_iv_26000 0.0007521968334968915


In [228]:
test_data = pd.read_parquet("../data/test_data.parquet")
common_cols = [col for col in train_data.columns if (col.startswith('call') or col.startswith('put')) and col in test_data.columns]

In [231]:
# Impute median for NaN values in train_data
for col in train_data.columns:
  if train_data[col].dtype in ['float64', 'int64'] and train_data[col].isna().any():
    median_val = train_data[col].median()
    train_data[col].fillna(median_val, inplace=True)
    print(f"Imputed {train_data[col].isna().sum()} NaN values in {col} with median: {median_val}")

print(f"Remaining NaN values in train_data: {train_data.isna().sum().sum()}")

Imputed 0 NaN values in call_iv_23500 with median: 0.30767049999999996
Imputed 0 NaN values in call_iv_23600 with median: 0.286101
Imputed 0 NaN values in call_iv_23700 with median: 0.2657875
Imputed 0 NaN values in call_iv_23800 with median: 0.244635
Imputed 0 NaN values in call_iv_23900 with median: 0.22357349999999998
Imputed 0 NaN values in call_iv_24000 with median: 0.1987605
Imputed 0 NaN values in call_iv_26000 with median: 0.342125
Imputed 0 NaN values in put_iv_24300 with median: 0.143855
Imputed 0 NaN values in put_iv_24600 with median: 0.15524149999999998
Imputed 0 NaN values in put_iv_24800 with median: 0.18014
Imputed 0 NaN values in put_iv_24900 with median: 0.188084
Imputed 0 NaN values in put_iv_25000 with median: 0.191225
Remaining NaN values in train_data: 0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting va

315

In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import root_mean_squared_error as rmse
from alive_progress import alive_bar
import time
import warnings

warnings.filterwarnings("ignore")


def train_xgb_with_gridsearch(given_df, col1, col2, use_gridsearch=True, verbose=True):
    """
    Train XGBoost model with optional grid search for hyperparameter optimization

    Parameters:
    -----------
    given_df : DataFrame
        Training data
    col1 : str
        Input IV column (previous strike)
    col2 : str
        Target IV column (current strike)
    use_gridsearch : bool
        Whether to perform grid search for hyperparameter tuning
    verbose : bool
        Whether to print detailed results

    Returns:
    --------
    dict : Contains model, scaler, parameters, and performance metrics
    """

    # Setup
    target_col = col2
    feature_cols = [col1] + ["underlying", "expiry"]

    # Prepare training data - drop rows with NaN
    train_clean = given_df[feature_cols + [target_col]].dropna()

    if len(train_clean) == 0:
        if verbose:
            print(f"Warning: No valid training data for {col2}")
        return None

    X = train_clean[feature_cols]
    y = train_clean[target_col]

    # Normalize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Split for validation
    X_train, X_val, y_train, y_val = train_test_split(
        X_scaled, y, test_size=0.2, random_state=42
    )

    if use_gridsearch:
        # Define parameter grid for grid search
        param_grid = {
            "n_estimators": [300, 500, 800],
            "learning_rate": [0.05, 0.1, 0.15],
            "max_depth": [4, 6, 8],
            "subsample": [0.8, 0.9, 1.0],
            # "colsample_bytree": [0.8, 0.9, 1.0],
            # "reg_alpha": [0, 0.1, 0.5],
            # "reg_lambda": [1, 1.5, 2],
        }

        # Create base XGBoost model
        xgb_base = XGBRegressor(random_state=42, n_jobs=-1, verbosity=0)

        # Setup cross-validation
        cv = KFold(n_splits=3, shuffle=True, random_state=42)

        # Perform grid search
        if verbose:
            print(f"Performing grid search for {col2}...")

        grid_search = GridSearchCV(
            estimator=xgb_base,
            param_grid=param_grid,
            cv=cv,
            scoring="neg_root_mean_squared_error",
            n_jobs=-1,
            verbose=0,
        )

        # Fit grid search
        grid_search.fit(X_train, y_train)

        # Get best model
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_

    else:
        # Use default parameters
        best_params = {
            "n_estimators": 800,
            "learning_rate": 0.1,
            "max_depth": 6,
            "subsample": 0.9,
            "colsample_bytree": 1.0,
            "reg_alpha": 0,
            "reg_lambda": 1,
        }

        best_model = XGBRegressor(
            **best_params, random_state=42, n_jobs=-1, verbosity=0
        )

        best_model.fit(X_train, y_train)

    # Make predictions on validation set
    y_pred_val = best_model.predict(X_val)

    # Calculate metrics
    rmse_score = rmse(y_val, y_pred_val)
    r2 = r2_score(y_val, y_pred_val)

    # Predict on test data (val_Y)
    test_data = val_Y[feature_cols].copy()
    test_data_scaled = scaler.transform(test_data)
    predicted_values = best_model.predict(test_data_scaled)

    # Calculate test RMSE if ground truth is available
    test_rmse = rmse(val_Y[target_col], predicted_values)

    if verbose:
        print(f"\n=== Results for {col2} ===")
        print(f"Training samples: {len(train_clean)}")
        print(f"Validation RMSE: {rmse_score:.6f}")
        print(f"Validation R²: {r2:.6f}")
        print(f"Test RMSE: {test_rmse:.6f}")
        if use_gridsearch:
            print(f"Best parameters: {best_params}")
        print("-" * 50)

    return {
        "model": best_model,
        "scaler": scaler,
        "best_params": best_params,
        "validation_rmse": rmse_score,
        "validation_r2": r2,
        "test_rmse": test_rmse,
        "predictions": predicted_values,
        "feature_cols": feature_cols,
        "target_col": target_col,
    }


def train_all_strikes_with_gridsearch(
    train_data, val_Y, common_cols, use_gridsearch=True
):
    """
    Train XGBoost models for all strike prices with grid search optimization

    Parameters:
    -----------
    train_data : DataFrame
        Training dataset
    val_Y : DataFrame
        Validation/test dataset with ground truth
    common_cols : list
        List of call IV column names
    use_gridsearch : bool
        Whether to use grid search for hyperparameter optimization

    Returns:
    --------
    dict : Dictionary containing results for each strike price
    """

    results = {}

    print(f"Training models for {len(common_cols)-1} strike pairs...")
    print(f"Grid search: {'Enabled' if use_gridsearch else 'Disabled'}")
    print("=" * 60)

    with alive_bar(len(common_cols) - 1) as bar:
        for i in range(len(common_cols) - 1):
            col1 = common_cols[i]  # Input IV (previous strike)
            col2 = common_cols[i + 1]  # Target IV (next strike)

            try:
                result = train_xgb_with_gridsearch(
                    train_data,
                    col1,
                    col2,
                    use_gridsearch=use_gridsearch,
                    verbose=False,  # Set to True for detailed output
                )

                if result is not None:
                    results[col2] = result
                    print(f"{col2}: Test RMSE = {result['test_rmse']:.6f}")

            except Exception as e:
                print(f"Error training model for {col2}: {str(e)}")

            bar()

    return results


def create_summary_report(results):
    """
    Create a summary report of all model results
    """
    if not results:
        print("No results to summarize")
        return

    print("\n" + "=" * 80)
    print("SUMMARY REPORT - XGBoost Models with Grid Search")
    print("=" * 80)

    # Create summary DataFrame
    summary_data = []
    for strike, result in results.items():
        summary_data.append(
            {
                "Strike": strike,
                "Val_RMSE": result["validation_rmse"],
                "Val_R2": result["validation_r2"],
                "Test_RMSE": result["test_rmse"],
                "N_Estimators": result["best_params"]["n_estimators"],
                "Learning_Rate": result["best_params"]["learning_rate"],
                "Max_Depth": result["best_params"]["max_depth"],
                "Subsample": result["best_params"]["subsample"],
            }
        )

    summary_df = pd.DataFrame(summary_data)

    print(f"\nOverall Performance:")
    print(f"Average Test RMSE: {summary_df['Test_RMSE'].mean():.6f}")
    print(
        f"Best Test RMSE: {summary_df['Test_RMSE'].min():.6f} ({summary_df.loc[summary_df['Test_RMSE'].idxmin(), 'Strike']})"
    )
    print(
        f"Worst Test RMSE: {summary_df['Test_RMSE'].max():.6f} ({summary_df.loc[summary_df['Test_RMSE'].idxmax(), 'Strike']})"
    )

    print(f"\nDetailed Results:")
    print(summary_df.to_string(index=False, float_format="%.6f"))

    return summary_df


def save_models(results, filepath_prefix="xgb_models"):
    """
    Save trained models and scalers to files
    """
    import pickle

    for strike, result in results.items():
        # Save model
        model_path = f"{filepath_prefix}_{strike}_model.pkl"
        with open(model_path, "wb") as f:
            pickle.dump(result["model"], f)

        # Save scaler
        scaler_path = f"{filepath_prefix}_{strike}_scaler.pkl"
        with open(scaler_path, "wb") as f:
            pickle.dump(result["scaler"], f)

    print(f"Saved {len(results)} models and scalers with prefix '{filepath_prefix}'")


# Example usage (assuming your data is already loaded):
# Uncomment and modify these lines based on your actual data setup

"""
# Load your data (modify paths as needed)
train_data = pd.read_parquet("../data/train_data.parquet")
val_Y = pd.read_parquet("val_Y.parquet")

# Define common columns
common_cols = [col for col in train_data.columns if col.startswith('call') and col in val_Y.columns]

# Train models with grid search (this will take longer but give better results)
results_with_gridsearch = train_all_strikes_with_gridsearch(
    train_data, val_Y, common_cols, use_gridsearch=True
)

# Create summary report
summary = create_summary_report(results_with_gridsearch)

# Save models (optional)
save_models(results_with_gridsearch, "optimized_xgb_models")

# For faster training without grid search:
# results_fast = train_all_strikes_with_gridsearch(
#     train_data, val_Y, common_cols, use_gridsearch=False
# )
"""


'\n# Load your data (modify paths as needed)\ntrain_data = pd.read_parquet("../data/train_data.parquet")\nval_Y = pd.read_parquet("val_Y.parquet")\n\n# Define common columns\ncommon_cols = [col for col in train_data.columns if col.startswith(\'call\') and col in val_Y.columns]\n\n# Train models with grid search (this will take longer but give better results)\nresults_with_gridsearch = train_all_strikes_with_gridsearch(\n    train_data, val_Y, common_cols, use_gridsearch=True\n)\n\n# Create summary report\nsummary = create_summary_report(results_with_gridsearch)\n\n# Save models (optional)\nsave_models(results_with_gridsearch, "optimized_xgb_models")\n\n# For faster training without grid search:\n# results_fast = train_all_strikes_with_gridsearch(\n#     train_data, val_Y, common_cols, use_gridsearch=False\n# )\n'

In [242]:
results_optimized = train_all_strikes_with_gridsearch(
    train_data, val_Y, common_cols, use_gridsearch=True
)

# Create summary report
summary = create_summary_report(results_optimized)

# Save models for later use
save_models(results_optimized, "optimized_xgb_models")

Training models for 41 strike pairs...
Grid search: Enabled
|⚠︎                                       | (!) 0/41 [0%] in 37:54.2 (0.00/s) 


KeyboardInterrupt: 